In [None]:
import pandas as pd

df = pd.read_csv("DATA/train_valid_without_errs.csv")
df.head()

In [None]:
transcripts = df["transcripts"].tolist()
len(transcripts)

In [None]:
with open("DATA/train.txt", "w") as f:
    for transcript in transcripts:
        if '\t' in transcript:
            f.write(transcript.split('\t')[0].strip() + "\n")
        else:
            f.write(transcript.strip() + "\n")

## Normalizing text corpus (transcriptions of train dataset)

In [None]:
from banglanlptoolkit import BnNLPNormalizerPlus

normalizer = BnNLPNormalizerPlus()
res = normalizer("DATA/train.txt")

## Training text tokenizer with custom data and vocab size

In [None]:
python process_asr_text_tokenizer.py --data_file="DATA/trainnormalized.txt" \
    --data_root="tokenizer" \
    --vocab_size=256 \
    --tokenizer="spe" \
    --spe_type="bpe" \
    --log

## Edit manifest file to remove extra tabs and newlines

In [1]:
import pandas as pd

df_manifest = pd.read_json("DATA/nemo_manifest.json", lines=True, orient="records")
df_manifest.head()

Unnamed: 0,audio_filepath,duration,text
0,/home/sami/workspace/nemo-asr-training/DATA/bn...,6.3,আমি এই চেষ্টাটি একদমই করিনি
1,/home/sami/workspace/nemo-asr-training/DATA/bn...,8.42,এজন্য আগামীকাল ও মঙ্গলবার মুখ্যমন্ত্রীর দপ্তর ...
2,/home/sami/workspace/nemo-asr-training/DATA/bn...,4.68,ফ্রোজেন ওয়াটার হতিছে যেটি পানি ঠান্ডায় জমে বরফ...
3,/home/sami/workspace/nemo-asr-training/DATA/cv...,2.268,এই কাজের জন্য তিনি নোবেল পুরস্কার লাভ করেন।
4,/home/sami/workspace/nemo-asr-training/DATA/cv...,3.636,তিনি অটল দাঁড়িয়ে রইলেন যখন পরাজিতরা পালিয়ে গেল।


In [3]:
df_manifest["target_lang"] = "bn"
df_manifest.head()

Unnamed: 0,audio_filepath,duration,text,target_lang
0,/home/sami/workspace/nemo-asr-training/DATA/bn...,6.3,আমি এই চেষ্টাটি একদমই করিনি,bn
1,/home/sami/workspace/nemo-asr-training/DATA/bn...,8.42,এজন্য আগামীকাল ও মঙ্গলবার মুখ্যমন্ত্রীর দপ্তর ...,bn
2,/home/sami/workspace/nemo-asr-training/DATA/bn...,4.68,ফ্রোজেন ওয়াটার হতিছে যেটি পানি ঠান্ডায় জমে বরফ...,bn
3,/home/sami/workspace/nemo-asr-training/DATA/cv...,2.268,এই কাজের জন্য তিনি নোবেল পুরস্কার লাভ করেন।,bn
4,/home/sami/workspace/nemo-asr-training/DATA/cv...,3.636,তিনি অটল দাঁড়িয়ে রইলেন যখন পরাজিতরা পালিয়ে গেল।,bn


In [2]:
df_manifest["duration"].min(), df_manifest["duration"].max()

(0.07200000000000001, 39.996)

In [None]:
# df_manifest["audio_filepath"] = "/home/sami/workspace/nemo-asr-training/DATA/" + df_manifest["audio_filepath"]
# df_manifest.head()

In [None]:
# from banglanlptoolkit import BnNLPNormalizer
# from tqdm import tqdm

# tqdm.pandas()

# normalizer = BnNLPNormalizer(allow_en=True)
# df_manifest["text"] = df_manifest["text"].progress_apply(normalizer.normalize_bn)
# df_manifest.head()

In [None]:
df_manifest["text"] = df_manifest["text"].apply(lambda x: x.split("\t")[0].strip() if "\t" in x else x.strip())
df_manifest.head()

In [None]:
df_manifest.info()

In [4]:
df_manifest.to_json("DATA/nemo_manifest.json", lines=True, orient="records", force_ascii=False)

## Split Manifest into train and validation sets

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd


df_manifest = pd.read_json("DATA/nemo_manifest.json", lines=True, orient="records")

df_manifest = df_manifest[df_manifest["duration"] < 20.0]
df_train, df_valid = train_test_split(df_manifest, test_size=0.001, random_state=42)

In [6]:
df_manifest["text"].str.len().max()

501

In [7]:
df_train.to_json("DATA/train_manifest.json", lines=True, orient="records", force_ascii=False)
df_valid.to_json("DATA/valid_manifest.json", lines=True, orient="records", force_ascii=False)

## Convert to Bucketing Dataset

In [None]:
!python convert_to_tarred_audio_dataset.py \
        --manifest_path=DATA/train_manifest.json \
        --target_dir=DATA/train_bucket \
        --num_shards=128 \
        --max_duration=40 \
        --min_duration=0.1 \
        --shuffle \
        --shuffle_seed=1 \
        --sort_in_shards \
        --workers=24 \
        --buckets_num=4

## Training

In [None]:
!python speech_to_text_hybrid_rnnt_ctc_bpe.py 

## Finetuning

In [1]:
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.utils import logging, exp_manager

In [2]:
model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt_ctc-110m")

[NeMo I 2025-03-13 15:09:53 mixins:176] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2025-03-13 15:09:56 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: null
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 40
    min_duration: 0.1
    is_tarred: true
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    shard_manifests: true
    use_lhotse: true
    use_bucketing: true
    num_buckets: 30
    bucket_duration_bins: null
    batch_duration: 600
    defer_setup: true
    
[NeMo W 2025-03-13 15:09:56 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
  

[NeMo I 2025-03-13 15:09:56 features:305] PADDING: 0


    


[NeMo I 2025-03-13 15:09:57 rnnt_models:226] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-03-13 15:09:57 rnnt_models:226] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}


[NeMo W 2025-03-13 15:09:57 tdt_loop_labels_computer:281] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2025-03-13 15:09:57 rnnt_models:226] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}


[NeMo W 2025-03-13 15:09:57 tdt_loop_labels_computer:281] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2025-03-13 15:10:00 save_restore_connector:275] Model EncDecHybridRNNTCTCBPEModel was successfully restored from /home/sami/.cache/huggingface/hub/models--nvidia--parakeet-tdt_ctc-110m/snapshots/431a349f3051ab85c22b9b7a2741b5fe77065665/parakeet-tdt_ctc-110m.nemo.


In [3]:
# Preserve the decoder parameters in case weight matching can be done later
pretrained_decoder = model.decoder.state_dict()

In [4]:
TOKENIZER_DIR = "tokenizer/tokenizer_spe_bpe_v1024"
model.change_vocabulary(new_tokenizer_dir=TOKENIZER_DIR, new_tokenizer_type="bpe")

[NeMo W 2025-03-13 15:10:00 modelPT:281] You tried to register an artifact under config key=tokenizer.model_path but an artifact for it has already been registered.
[NeMo W 2025-03-13 15:10:00 modelPT:281] You tried to register an artifact under config key=tokenizer.vocab_path but an artifact for it has already been registered.
[NeMo W 2025-03-13 15:10:00 modelPT:281] You tried to register an artifact under config key=tokenizer.spe_tokenizer_vocab but an artifact for it has already been registered.


[NeMo I 2025-03-13 15:10:00 mixins:176] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2025-03-13 15:10:01 rnnt_models:226] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}


[NeMo W 2025-03-13 15:10:01 tdt_loop_labels_computer:281] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2025-03-13 15:10:01 hybrid_rnnt_ctc_bpe_models:367] Changed tokenizer of the RNNT decoder to ['<unk>', '▁ক', 'য়', 'ার', '▁ব', '▁প', '▁স', 'ের', '্য', '্র', '▁আ', '▁এ', 'ান', '▁হ', '▁ম', '▁ত', '▁কর', '▁দ', '▁ন', '্ত', '▁জ', 'েন', '▁অ', 'কে', 'য়ে', 'তে', 'িন', '▁য', 'াল', '▁প্র', 'ছে', 'ায়', '▁গ', '▁র', 'িক', '▁শ', 'টি', 'বে', 'র্', '▁চ', 'লা', 'লে', '▁বি', 'ড়', 'াম', 'াক', '▁এক', '▁ভ', 'তি', '্ট', 'াজ', '▁উ', '্ব', '▁থ', '▁আম', 'িল', 'য়া', 'িত', '্যা', '▁নি', '▁পর', '▁বা', '▁করে', '্ষ', 'দের', 'াই', '▁কি', 'ুর', 'াস', '▁ও', '▁ফ', 'াত', 'ন্য', 'ন্', '▁দে', 'িনি', '▁সম', 'ুল', '▁খ', '▁ছ', 'নে', '▁যে', '্থ', '▁তা', 'টা', 'রা', '▁কো', '▁হয়', '▁ই', 'িয়ে', 'ির', '▁ট', 'ন্ত', 'াব', 'েকে', '▁না', '▁ল', '্প', 'ঙ্', 'েশ', '▁থেকে', 'ক্ষ', '▁তিনি', '▁সে', '▁করা', 'কার', '▁হয়ে', '▁ধ', 'চ্', '▁পা', '▁জন্য', '▁এব', 'িস', 'নের', '▁মা', 'বা', 'না', '▁এই', '▁পার', '▁এবং', 'াবে', 'েল', '▁ব্য', 'দ্', '্রী', '▁তার', 'িশ', 'ানে', 'ধ্য', 'রে', 'গে', 'ছেন', 'াকা', 'ার্', 'িতে', '▁সা', 'ক্ত', '▁

In [5]:
model.decoder

RNNTDecoder(
  (prediction): ModuleDict(
    (embed): Embedding(1025, 640, padding_idx=1024)
    (dec_rnn): LSTMDropout(
      (lstm): LSTM(640, 640, dropout=0.2)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
)

In [6]:
import torch
import torch.nn as nn

def enable_bn_se(m):
    if type(m) == nn.BatchNorm1d:
        m.train()
        for param in m.parameters():
            param.requires_grad_(True)

    if 'SqueezeExcite' in type(m).__name__:
        m.train()
        for param in m.parameters():
            param.requires_grad_(True)

In [7]:
freeze_encoder = True

if freeze_encoder:
  model.encoder.freeze()
  model.encoder.apply(enable_bn_se)
  logging.info("Model encoder has been frozen")
else:
  model.encoder.unfreeze()
  logging.info("Model encoder has been un-frozen")

[NeMo I 2025-03-13 15:10:01 3588566142:6] Model encoder has been frozen


In [8]:
import copy

cfg = copy.deepcopy(model.cfg)

# Setup new tokenizer
cfg.tokenizer.dir = TOKENIZER_DIR
cfg.tokenizer.type = "bpe"

# Set tokenizer config
model.cfg.tokenizer = cfg.tokenizer

In [9]:
from omegaconf import OmegaConf, open_dict

# Setup train/val/test configs
print(OmegaConf.to_yaml(cfg.train_ds))

manifest_filepath: null
sample_rate: 16000
batch_size: null
shuffle: true
num_workers: 8
pin_memory: true
max_duration: 40
min_duration: 0.1
is_tarred: true
tarred_audio_filepaths: null
shuffle_n: 2048
bucketing_strategy: fully_randomized
bucketing_batch_size: null
shard_manifests: true
use_lhotse: true
use_bucketing: true
num_buckets: 30
bucket_duration_bins: null
batch_duration: 600
defer_setup: true



In [10]:
# Setup train, validation, test configs
with open_dict(cfg):
  # Train dataset
  cfg.train_ds.manifest_filepath = "DATA/train_manifest.json"
  cfg.train_ds.batch_size = 16
  cfg.train_ds.num_workers = 8
  cfg.train_ds.pin_memory = True
  cfg.train_ds.use_start_end_token = True
  cfg.train_ds.trim_silence = True

  # Validation dataset
  cfg.validation_ds.manifest_filepath = "DATA/valid_manifest.json"
  cfg.validation_ds.batch_size = 8
  cfg.validation_ds.num_workers = 8
  cfg.validation_ds.pin_memory = True
  cfg.validation_ds.use_start_end_token = True
  cfg.validation_ds.trim_silence = True

#   # Test dataset
#   cfg.test_ds.manifest_filepath = test_manifest_cleaned
#   cfg.test_ds.batch_size = 8
#   cfg.test_ds.num_workers = 8
#   cfg.test_ds.pin_memory = True
#   cfg.test_ds.use_start_end_token = True
#   cfg.test_ds.trim_silence = True

In [11]:
# setup model with new configs
model.setup_training_data(cfg.train_ds)
model.setup_multiple_validation_data(cfg.validation_ds)
# model.setup_multiple_test_data(cfg.test_ds)

[NeMo I 2025-03-13 15:11:17 dataloader:203] We will be using a Lhotse DataLoader.


[NeMo W 2025-03-13 15:11:17 dataloader:230] You are using a non-tarred dataset and requested tokenization during data sampling (pretokenize=True). This will cause the tokenization to happen in the main (GPU) process, possibly impacting the training speed if your tokenizer is very large. If the impact is noticable, set pretokenize=False in dataloader config. (note: that will disable token-per-second filtering and 2D bucketing features)


[NeMo I 2025-03-13 15:11:17 dataloader:331] Creating a Lhotse DynamicBucketingSampler (max_batch_duration=600.0 max_batch_size=16)
[NeMo I 2025-03-13 15:11:18 collections:197] Dataset loaded with 3635 files totalling 5.11 hours
[NeMo I 2025-03-13 15:11:18 collections:198] 0 files were filtered totalling 0.00 hours


In [None]:
from tqdm import tqdm

def analyse_ctc_failures_in_model(model):
    count_ctc_failures = 0
    am_seq_lengths = []
    target_seq_lengths = []

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    mode = model.training

    train_dl = model.train_dataloader()

    with torch.no_grad():
      model = model.eval()
      for batch in tqdm(train_dl, desc='Checking for CTC failures'):
          x, x_len, y, y_len = batch
          x, x_len = x.to(device), x_len.to(device)
          x_logprobs, greedy_predictions = model(input_signal=x, input_signal_length=x_len)
          # Find how many CTC loss computation failures will occur
          for xl, yl in zip(x_len, y_len):
              if xl <= yl:
                  count_ctc_failures += 1

          # Record acoustic model lengths=
          am_seq_lengths.extend(x_len.to('cpu').numpy().tolist())

          # Record target sequence lengths
          target_seq_lengths.extend(y_len.to('cpu').numpy().tolist())

          del x, x_len, y, y_len, x_logprobs, greedy_predictions

    if mode:
      model = model.train()

    return count_ctc_failures, am_seq_lengths, target_seq_lengths

results = analyse_ctc_failures_in_model(model)

[extra info] When calling: Recording.load_audio(args=(Recording(id='common_voice_bn_31740487', sources=[AudioSource(type='file', channels=[0], source='/home/sami/workspace/nemo-asr-training/DATA/cv-corpus-20.0-2024-12-06/bn/clips/common_voice_bn_31740487.mp3')], sampling_rate=16000, num_samples=96083, duration=6.0051875, channel_ids=[0], transforms=[Resample(source_sampling_rate=32000, target_sampling_rate=16000)]),) kwargs={'channels': 0, 'offset': 0.0, 'duration': 6.00515625})
[extra info] When calling: MonoCut.load_audio(args=(MonoCut(id='common_voice_bn_31740487', start=0.0, duration=6.00515625, channel=0, supervisions=[SupervisionSegment(id='common_voice_bn_31740487', recording_id='common_voice_bn_31740487', start=0, duration=6.00515625, channel=0, text='তিনি একটি আরবি ভাষা একাডেমী প্রতিষ্ঠা, আরবি ভাষা বিভাগ স্পনসর।', language=None, speaker=None, gender=None, custom={'tokens': array([102, 179, 159, 150, 817, 937,  46, 937, 971, 938, 948, 965, 758,
       937, 979, 159, 150, 817, 9

In [None]:
num_ctc_failures, am_seq_lengths, target_seq_lengths = results

In [None]:
if num_ctc_failures > 0:
  logging.warning(f"\nCTC loss will fail for {num_ctc_failures} samples ({num_ctc_failures * 100./ float(len(am_seq_lengths))} % of samples)!\n"
                  f"Increase the vocabulary size of the tokenizer so that this number becomes close to zero !")
else:
  logging.info("No CTC failure cases !")

In [12]:
# Compute average ratio of T / U
avg_T = sum(am_seq_lengths) / float(len(am_seq_lengths))
avg_U = sum(target_seq_lengths) / float(len(target_seq_lengths))

avg_length_ratio = 0
for am_len, tgt_len in zip(am_seq_lengths, target_seq_lengths):
  avg_length_ratio += (am_len / float(tgt_len))
avg_length_ratio = avg_length_ratio / len(am_seq_lengths)

print(f"Average Acoustic model sequence length = {avg_T}")
print(f"Average Target sequence length = {avg_U}")
print()
print(f"Ratio of Average AM sequence length to target sequence length = {avg_length_ratio}")

NameError: name 'am_seq_lengths' is not defined

In [13]:
print(OmegaConf.to_yaml(cfg.optim))

name: adamw
lr: 2.0
betas:
- 0.9
- 0.98
weight_decay: 0.001
sched:
  name: NoamAnnealing
  d_model: 512
  warmup_steps: 5000
  warmup_ratio: null
  min_lr: 1.0e-06



In [14]:
with open_dict(model.cfg.optim):
  model.cfg.optim.lr = 0.025
  model.cfg.optim.weight_decay = 0.001
  model.cfg.optim.sched.warmup_steps = None  # Remove default number of steps of warmup
  model.cfg.optim.sched.warmup_ratio = 0.10  # 10 % warmup
  model.cfg.optim.sched.min_lr = 1e-9

In [15]:
with open_dict(model.cfg.spec_augment):
  model.cfg.spec_augment.freq_masks = 2
  model.cfg.spec_augment.freq_width = 25
  model.cfg.spec_augment.time_masks = 10
  model.cfg.spec_augment.time_width = 0.05

model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)

In [16]:
model.wer.use_cer = True
model.wer.log_prediction = True

In [17]:
import torch
import lightning.pytorch as ptl

if torch.cuda.is_available():
  accelerator = 'gpu'
else:
  accelerator = 'gpu'

EPOCHS = 50  # 100 epochs would provide better results

trainer = ptl.Trainer(devices=1,
                      accelerator=accelerator,
                      max_epochs=EPOCHS,
                      accumulate_grad_batches=1,
                      enable_checkpointing=False,
                      logger=False,
                      log_every_n_steps=5,
                      check_val_every_n_epoch=10)

# Setup model with the trainer
model.set_trainer(trainer)

# finally, update the model's internal config
model.cfg = model._cfg

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [18]:
from nemo.utils import exp_manager
import os

LANGUAGE = "bn"

# Environment variable generally used for multi-node multi-gpu training.
# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.
os.environ.pop('NEMO_EXPM_VERSION', None)

config = exp_manager.ExpManagerConfig(
    exp_dir=f'experiments/lang-{LANGUAGE}/',
    name=f"ASR-Model-Language-{LANGUAGE}",
    checkpoint_callback_params=exp_manager.CallbackParams(
        monitor="val_wer",
        mode="min",
        always_save_nemo=True,
        save_best_model=True,
    ),
)

config = OmegaConf.structured(config)

logdir = exp_manager.exp_manager(trainer, config)

[NeMo I 2025-03-13 15:12:20 exp_manager:450] ExpManager schema
[NeMo I 2025-03-13 15:12:20 exp_manager:451] {'explicit_log_dir': None, 'exp_dir': None, 'name': None, 'version': None, 'use_datetime_version': True, 'resume_if_exists': False, 'resume_past_end': False, 'resume_ignore_no_checkpoint': False, 'resume_from_checkpoint': None, 'create_tensorboard_logger': True, 'summary_writer_kwargs': None, 'create_wandb_logger': False, 'wandb_logger_kwargs': None, 'create_mlflow_logger': False, 'mlflow_logger_kwargs': {'experiment_name': None, 'tracking_uri': None, 'tags': None, 'save_dir': './mlruns', 'prefix': '', 'artifact_location': None, 'run_id': None, 'log_model': False}, 'create_dllogger_logger': False, 'dllogger_logger_kwargs': {'verbose': False, 'stdout': False, 'json_file': './dllogger.json'}, 'create_clearml_logger': False, 'clearml_logger_kwargs': {'project': None, 'task': None, 'connect_pytorch': False, 'model_name': None, 'tags': None, 'log_model': False, 'log_cfg': False, 'log_

In [19]:
%%time
trainer.fit(model)

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2025-03-13 15:12:24 modelPT:793] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.025
        maximize: False
        weight_decay: 0.001
    )


TypeError: object of type 'LhotseSpeechToTextBpeDataset' has no len()

## Using Finetune Script from nemo repo

In [None]:
python speech_to_text_finetune.py 