# Install Dependencies

In [1]:
# Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg libsox-fmt-mp3
!pip install unidecode
!pip install matplotlib>=3.3.2

## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=bd6926499de5cf9b15fed32026a7c2041a55bb06f9769edff393523e58189ba7
  Stored in directory: /root/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsndfile1 is already the newest version (1.0.31-2ubuntu0.2).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
The following additional packages will be installed:
  libid3tag0 libmad0 libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3
  libwavpack1
Suggested packages:
  libsox-fmt-all
The following NEW packa

# Finetuning QuartzNet Models On Vietnamese Language

## Import Libraries

In [1]:
import os
import glob
import json
import subprocess
import librosa
import tarfile
from tqdm.auto import tqdm
import wget
import copy
from omegaconf import OmegaConf, open_dict
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.utils import logging, exp_manager

  from .autonotebook import tqdm as notebook_tqdm


## Make Data Directory

In [2]:
data_dir = '.'

## Download VIVOS Dataset

In [None]:
# Download the dataset
print("******")
if not os.path.exists(data_dir + '/vivos.tar.gz'):
    vivos_url = 'www.ailab.hcmus.edu.vn/assets/vivos.tar.gz'
    vivos_path = wget.download(vivos_url, data_dir)
    print(f"Dataset downloaded at: {vivos_path}")
else:
    print("Tarfile already exists.")
    vivos_path = data_dir + '/vivos_sphere.tar.gz'

if not os.path.exists(data_dir + '/vivos/'):
    # Untar and convert .sph to .wav (using sox)
    tar = tarfile.open(vivos_path)
    tar.extractall(path=data_dir)

    print("Converting .sph to .wav...")
    sph_list = glob.glob(data_dir + '/vivos/**/*.sph', recursive=True)
    for sph_path in sph_list:
        wav_path = sph_path[:-4] + '.wav'
        cmd = ["sox", sph_path, wav_path]
        subprocess.run(cmd)
print("Finished conversion.\n******")

******
Dataset downloaded at: ./vivos.tar.gz
Converting .sph to .wav...
Finished conversion.
******


In [3]:
LANGUAGE = 'vi'

## Preparing Dataset For Training

### Manifest Utilities

In [4]:
# Function to build a manifest
def build_manifest(transcripts_path, manifest_path, wav_path):
    with open(transcripts_path, 'r',encoding='utf8') as fin:
        with open(manifest_path, 'w', encoding='utf8') as fout:
            for line in fin:

                transcript = line[line.find(' ') : -1].lower()
                transcript = transcript.strip()

                file_id = line[: line.find(' ')]

                audio_path = os.path.join(
                    data_dir, wav_path,
                    file_id[file_id.find('V') : file_id.rfind('_')],
                    file_id + '.wav')

                duration = librosa.core.get_duration(filename=audio_path)

                # Write the metadata to the manifest
                metadata = {
                    "audio_filepath": audio_path,
                    "duration": duration,
                    "text": transcript
                    }
                json.dump(metadata, fout, ensure_ascii=False)
                fout.write('\n')

# Building Manifests
print("******")
train_transcripts = data_dir + '/vivos/train/prompts.txt'
train_manifest = data_dir + '/vivos/train_manifest.json'
if not os.path.isfile(train_manifest):
    build_manifest(train_transcripts, train_manifest, 'vivos/train/waves')
    print("Train manifest created.")

test_transcripts = data_dir + '/vivos/test/prompts.txt'
test_manifest = data_dir + '/vivos/test_manifest.json'
if not os.path.isfile(test_manifest):
    build_manifest(test_transcripts, test_manifest, 'vivos/test/waves')
    print("Test manifest created.")
print("***Done***")

******
***Done***


In [5]:
def read_manifest(path):
    manifest = []
    with open(path, 'r', encoding = 'utf8') as f:
        for line in tqdm(f, desc="Reading manifest data"):
            line = line.replace("\n", "")
            data = json.loads(line)
            manifest.append(data)
    return manifest

In [6]:
train_manifest_data = read_manifest(train_manifest)
test_manifest_data = read_manifest(test_manifest)

Reading manifest data: 11660it [00:00, 297741.83it/s]
Reading manifest data: 760it [00:00, 349180.75it/s]


In [7]:
train_text = [data['text'] for data in train_manifest_data]
test_text = [data['text'] for data in test_manifest_data]

### Character Set

Let us calculate the character set - which is the set of unique tokens that exist within the text manifests.

In [8]:
from collections import defaultdict

def get_charset(manifest_data):
    charset = defaultdict(int)
    for row in tqdm(manifest_data, desc="Computing character set"):
        text = row['text']
        for character in text:
            charset[character] += 1
    return charset

In [9]:
train_charset = get_charset(train_manifest_data)
test_charset = get_charset(test_manifest_data)

Computing character set: 100%|██████████| 11660/11660 [00:00<00:00, 336435.33it/s]
Computing character set: 100%|██████████| 760/760 [00:00<00:00, 443471.21it/s]


In [10]:
train_set = set(train_charset.keys())
test_set = set(test_charset.keys())

### Final Character Set

After pre-processing the dataset, let's recover the final character set used to train the models.

In [11]:
print(f"Number of tokens in train set : {len(train_set)}")
print(f"Number of tokens in test set : {len(test_set)}")

Number of tokens in train set : 92
Number of tokens in test set : 88


# Character Encoding CTC Model

Now that we have a processed dataset, we can begin training an ASR model on this dataset. The following section will detail how we prepare a CTC model which utilizes a Character Encoding scheme.

This section will utilize a pre-trained QuartzNet 15x5, which has been trained on roughly 7,000 hours of English speech base model. We will modify the decoder layer (thereby changing the model's vocabulary) and then train for a small number of epochs.

In [12]:
char_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_quartznet15x5", map_location='cuda')

[NeMo I 2025-03-03 12:34:42 cloud:58] Found existing object /home/manh264/.cache/torch/NeMo/NeMo_2.3.0rc0/stt_en_quartznet15x5/16661021d16e679bdfd97a2a03944c49/stt_en_quartznet15x5.nemo.
[NeMo I 2025-03-03 12:34:42 cloud:64] Re-using file from: /home/manh264/.cache/torch/NeMo/NeMo_2.3.0rc0/stt_en_quartznet15x5/16661021d16e679bdfd97a2a03944c49/stt_en_quartznet15x5.nemo
[NeMo I 2025-03-03 12:34:42 common:826] Instantiating model from pre-trained checkpoint


[NeMo W 2025-03-03 12:34:43 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data2/voices/train_1k.json
    sample_rate: 16000
    labels:
    - ' '
    - a
    - b
    - c
    - d
    - e
    - f
    - g
    - h
    - i
    - j
    - k
    - l
    - m
    - 'n'
    - o
    - p
    - q
    - r
    - s
    - t
    - u
    - v
    - w
    - x
    - 'y'
    - z
    - ''''
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: /asr_set_1.2/train/train_{0..1023}.tar
    num_workers: 20
    
[NeMo W 2025-03-03 12:34:43 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
   

[NeMo I 2025-03-03 12:34:43 features:305] PADDING: 16
[NeMo I 2025-03-03 12:34:47 save_restore_connector:275] Model EncDecCTCModel was successfully restored from /home/manh264/.cache/torch/NeMo/NeMo_2.3.0rc0/stt_en_quartznet15x5/16661021d16e679bdfd97a2a03944c49/stt_en_quartznet15x5.nemo.


## Update The Vocabulary

In [13]:
char_model.change_vocabulary(new_vocabulary=list(train_set))

[NeMo I 2025-03-03 12:34:51 ctc_models:263] Changed decoder to output to ['ị', 'ẩ', 'ằ', 'v', 'g', 'l', 'ã', ' ', 'ẹ', 'ẽ', 'ấ', '4', 'e', 'á', 'p', 'ệ', 'ỡ', 'o', 'à', 'i', 'ư', 'h', 'ế', 'ẻ', 'ầ', 'ỉ', 'ì', 'ỏ', 'r', 'n', 'ớ', 'ọ', 'ụ', 'ê', 'ó', 'ũ', 'ở', 'ỷ', 'ố', 'm', ':', 'ơ', 'ứ', 'ạ', 'ẳ', 'ô', 'ả', 'ậ', 'ă', 'đ', 'd', 'b', 'x', 'è', 'í', 'u', 'ể', 'ổ', 't', 's', 'c', 'ử', 'ò', 'ý', 'k', 'ễ', 'ẫ', 'q', 'ữ', 'ỵ', 'ộ', 'ỳ', 'ặ', 'õ', 'ỹ', 'y', 'ừ', 'ề', 'ẵ', 'ủ', 'ắ', 'ự', 'ồ', 'ỗ', 'a', 'â', 'ù', 'é', 'ợ', 'ú', 'ĩ', 'ờ'] vocabulary.


## Training On Low Resource Languages

If the amount of training data or available computational resources are limited, it might be useful to freeze the encoder module of the network and train just the final decoder layer. This is also useful in cases where GPU memory is insufficient to train a large network, or cases where the model might overfit due to its size.

In cases where sufficient data is available - and "sufficient" is dependent on the complexity of the language - then it is advised to train the encoder as well to get the best possible transcript. When we say sufficient is relative to the language, we have noticed that some languages can obtain reasonable scores with a few hundred hours of transcribed speech, whereas some languages require several thousand hours.

It is also important to note that if the language remains the same, and some specific domain of text must be adapted for ASR, it is often easier to add a domain-specific language model to guide the generic ASR model than to attempt fine-tuning a full ASR model on limited data from that specific domain

In [14]:
#@title Freeze Encoder { display-mode: "form" }
freeze_encoder = False #@param ["False", "True"] {type:"raw"}
freeze_encoder = bool(freeze_encoder)

In [15]:
import torch
import torch.nn as nn

def enable_bn_se(m):
    if type(m) == nn.BatchNorm1d:
        m.train()
        for param in m.parameters():
            param.requires_grad_(True)

    if 'SqueezeExcite' in type(m).__name__:
        m.train()
        for param in m.parameters():
            param.requires_grad_(True)

In [16]:
if freeze_encoder:
  char_model.encoder.freeze()
  char_model.encoder.apply(enable_bn_se)
  logging.info("Model encoder has been frozen, and batch normalization has been unfrozen")
else:
  char_model.encoder.unfreeze()
  logging.info("Model encoder has been un-frozen")

[NeMo I 2025-03-03 12:35:00 705968631:7] Model encoder has been un-frozen


## Update Config

Each NeMo model has a config embedded in it, which can be accessed via `model.cfg`. In general, this is the config that was used to construct the model.

For pre-trained models, this config generally represents the config used to construct the model when it was trained. A nice benefit to this embedded config is that we can repurpose it to set up new data loaders, optimizers, schedulers, and even data augmentation!

### Updating The Character Set Of The Model

In [17]:
char_model.cfg.labels = list(train_set)

In [18]:
cfg = copy.deepcopy(char_model.cfg)

### Setting Up Data Loaders

In [19]:
# Setup train, validation, test configs
with open_dict(cfg):
  # Train dataset  (Concatenate train manifest cleaned and dev manifest cleaned)
  cfg.train_ds.manifest_filepath = f"{train_manifest}"
  cfg.train_ds.labels = list(train_set)
  cfg.train_ds.normalize_transcripts = False
  cfg.train_ds.batch_size = 32
  cfg.train_ds.num_workers = 8
  cfg.train_ds.pin_memory = True
  cfg.train_ds.trim_silence = True

  # Validation dataset  (Use test dataset as validation, since we train using train + dev)
  cfg.validation_ds.manifest_filepath = test_manifest
  cfg.validation_ds.labels = list(train_set)
  cfg.validation_ds.normalize_transcripts = False
  cfg.validation_ds.batch_size = 8
  cfg.validation_ds.num_workers = 8
  cfg.validation_ds.pin_memory = True
  cfg.validation_ds.trim_silence = True

In [20]:
# setup data loaders with new configs
char_model.setup_training_data(cfg.train_ds)
char_model.setup_multiple_validation_data(cfg.validation_ds)

[NeMo I 2025-03-03 12:35:10 collections:201] Dataset loaded with 11657 files totalling 14.91 hours
[NeMo I 2025-03-03 12:35:10 collections:202] 3 files were filtered totalling 0.01 hours
[NeMo I 2025-03-03 12:35:10 collections:201] Dataset loaded with 760 files totalling 0.75 hours
[NeMo I 2025-03-03 12:35:10 collections:202] 0 files were filtered totalling 0.00 hours


### Setting Up Optimizer And Scheduler

In [21]:
# Original optimizer + scheduler
print(OmegaConf.to_yaml(char_model.cfg.optim))

name: novograd
lr: 0.01
betas:
- 0.8
- 0.5
weight_decay: 0.001
sched:
  name: CosineAnnealing
  warmup_steps: null
  warmup_ratio: null
  min_lr: 0.0
  last_epoch: -1



In [22]:
with open_dict(char_model.cfg.optim):
  char_model.cfg.optim.lr = 5e-5
  char_model.cfg.optim.betas = [0.95, 0.5]  # from paper
  char_model.cfg.optim.weight_decay = 0.001  # Original weight decay
  char_model.cfg.optim.sched.warmup_steps = None  # Remove default number of steps of warmup
  char_model.cfg.optim.sched.warmup_ratio = None
  char_model.cfg.optim.sched.min_lr = 0.0

### Setting Up Augmentation

Remember that the model was trained on several thousands of hours of data, so the regularization provided to it might not suit the current dataset. We can easily change it as we see fit.

You might notice that we utilize `char_model.from_config_dict()` to create a new SpectrogramAugmentation object and assign it directly in place of the previous augmentation. This is generally the syntax to be followed whenever you notice a `_target_` tag in the config of a model's inner config.

**Note:** For low resource languages, it might be better to increase augmentation via SpecAugment to reduce overfitting. However, this might, in turn, make it too hard for the model to train in a short number of epochs.

In [23]:
print(OmegaConf.to_yaml(char_model.cfg.spec_augment))

_target_: nemo.collections.asr.modules.SpectrogramAugmentation
rect_freq: 50
rect_masks: 5
rect_time: 120



In [24]:
# with open_dict(char_model.cfg.spec_augment):
#   char_model.cfg.spec_augment.freq_masks = 2
#   char_model.cfg.spec_augment.freq_width = 25
#   char_model.cfg.spec_augment.time_masks = 2
#   char_model.cfg.spec_augment.time_width = 0.05

char_model.spec_augmentation = char_model.from_config_dict(char_model.cfg.spec_augment)

## Setup Metrics

In [25]:
#@title Metric
use_cer = True #@param ["False", "True"] {type:"raw"}
log_prediction = True #@param ["False", "True"] {type:"raw"}

In [27]:
char_model.wer.use_cer = use_cer
char_model.wer.log_prediction = log_prediction

## Setup Trainer And Experiment Manager

In [None]:
import torch
import lightning.pytorch as ptl

if torch.cuda.is_available():
  accelerator = 'gpu'
else:
  accelerator = 'cpu'

EPOCHS = 1000  # 100 epochs would provide better results, but would take an hour to train

trainer = ptl.Trainer(devices=1,
                      accelerator=accelerator,
                      max_epochs=EPOCHS,
                      accumulate_grad_batches=1,
                      enable_checkpointing=False,
                      logger=False,
                      log_every_n_steps=5,
                      check_val_every_n_epoch=10)

# Setup model with the trainer
char_model.set_trainer(trainer)

# Finally, update the model's internal config
char_model.cfg = char_model._cfg

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[NeMo W 2025-03-03 12:35:34 modelPT:1567] World size can only be set by PyTorch Lightning Trainer.


In [29]:
# Environment variable generally used for multi-node multi-gpu training.
# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.
# os.environ.pop('NEMO_EXPM_VERSION', None)

config = exp_manager.ExpManagerConfig(
    exp_dir=f'experiments/lang-{LANGUAGE}/',
    name=f"ASR-Char-Model-Language-{LANGUAGE}",
    checkpoint_callback_params=exp_manager.CallbackParams(
        monitor="val_wer",
        mode="min",
        always_save_nemo=True,
        save_best_model=True,
    ),
)

config = OmegaConf.structured(config)

logdir = exp_manager.exp_manager(trainer, config)

[NeMo I 2025-03-03 12:35:37 exp_manager:469] ExpManager schema
[NeMo I 2025-03-03 12:35:37 exp_manager:470] {'explicit_log_dir': None, 'exp_dir': None, 'name': None, 'version': None, 'use_datetime_version': True, 'resume_if_exists': False, 'resume_past_end': False, 'resume_ignore_no_checkpoint': False, 'resume_from_checkpoint': None, 'create_tensorboard_logger': True, 'summary_writer_kwargs': None, 'create_wandb_logger': False, 'wandb_logger_kwargs': None, 'create_mlflow_logger': False, 'mlflow_logger_kwargs': {'experiment_name': None, 'tracking_uri': None, 'tags': None, 'save_dir': './mlruns', 'prefix': '', 'artifact_location': None, 'run_id': None, 'log_model': False}, 'create_dllogger_logger': False, 'dllogger_logger_kwargs': {'verbose': False, 'stdout': False, 'json_file': './dllogger.json'}, 'create_clearml_logger': False, 'clearml_logger_kwargs': {'project': None, 'task': None, 'connect_pytorch': False, 'model_name': None, 'tags': None, 'log_model': False, 'log_cfg': False, 'log_

    


[NeMo I 2025-03-03 12:35:37 exp_manager:665] TFLOPs per sec per GPU will be calculated, conditioned on supported models. Defaults to -1 upon failure.


# Let's Train !!!

In [30]:
%%time
trainer.fit(char_model)

TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `EncDecCTCModel`

# Saving And Loading Model

## Save The Final Model

In [None]:
char_model.save_to('quartznet.nemo')

## Load The Model

In [None]:
quartznet = nemo_asr.models.EncDecCTCModel.restore_from('quartznet.nemo')

# Conclusion

This tutorial discussed the generic steps to prepare a dataset in a different language, prepared QuartzNet model for fine-tuning, and discussed some additional insights for fine-tuning CTC-based models.

While the focus was on a small dataset for Vietnamese, nearly all of this information can be used for larger datasets and other scenarios where compute is limited, or the model's size prevents fine-tuning the entire model.