In [2]:
import torch
import os
from omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer

import nemo.collections.asr as nemo_asr
from pathlib import Path


In [3]:
# Locate the pretrained checkpoint (.nemo archive) on local disk.
MODEL_ROOT = Path("/external2/models/hf/stt_en_conformer_ctc_large/")
model_path = MODEL_ROOT.joinpath("stt_en_conformer_ctc_large.nemo")

# Only the configuration is fetched here (return_config=True); the model itself
# is restored from `model_path` together with this `cfg` in a later cell.
cfg = nemo_asr.models.ASRModel.restore_from(
    restore_path=model_path,
    return_config=True,
)

# Dump the restored configuration for inspection.
print(cfg)

{'sample_rate': 16000, 'log_prediction': True, 'ctc_reduction': 'mean_batch', 'train_ds': {'manifest_filepath': '/data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json', 'sample_rate': 16000, 'batch_size': 32, 'shuffle': True, 'num_workers': 8, 'pin_memory': True, 'use_start_end_token': False, 'trim_silence': False, 'max_duration': 20.0, 'min_duration': 0.1, 'shuffle_n': 2048, 'is_tarred': True, 'tarred_audio_filepaths': '/data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar'}, 'validation_ds': {'manifest_filepath': ['/data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-other.json', '/data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-clean.json', '/data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-other.json', '/data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-clean.json'], 'sample_rate': 16000, 'batch_size': 16, 'shuffle': False, 'num_workers': 8, 'pin_memory': True, 'use_start_end_token': False, 'is_tarred':

In [4]:
# from nemo.core import adapter_mixins

# # Utility method to check and update the model config
# def update_model_config_to_support_adapter(model_cfg):
#     with open_dict(model_cfg):
#         adapter_metadata = adapter_mixins.get_registered_adapter(model_cfg.encoder._target_)
#         if adapter_metadata is not None:
#             model_cfg.encoder._target_ = adapter_metadata.adapter_class_path

#     print("Updated encoder _target_ model :", model_cfg.encoder._target_)
#     return model_cfg

In [5]:
#cfg = update_model_config_to_support_adapter(cfg)

In [6]:
# Restore the pretrained ASR model from the checkpoint, handing it the config loaded above.
model = nemo_asr.models.ASRModel.restore_from(
    restore_path=model_path,
    override_config_path=cfg,
)

[NeMo I 2024-07-18 00:18:48 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2024-07-18 00:18:48 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 32
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    shuffle_n: 2048
    is_tarred: true
    tarred_audio_filepaths: /data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar
    
[NeMo W 2024-07-18 00:18:48 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath:
    - /data/ASR/LibriSpeech/librispeech_withs

[NeMo I 2024-07-18 00:18:48 features:289] PADDING: 0
[NeMo I 2024-07-18 00:18:50 save_restore_connector:249] Model EncDecCTCModelBPE was successfully restored from /external2/models/hf/stt_en_conformer_ctc_large/stt_en_conformer_ctc_large.nemo.


In [7]:
# Locations of the fine-tuning data: train / validation manifests and the tag inventory.
data_dir = "/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/"
TRAIN_MANIFEST = os.path.join(data_dir, "train.json")
TEST_MANIFEST = os.path.join(data_dir, "valid.json")  # note: points at the validation split
ALL_TAGS = os.path.join(data_dir, "alltags_uniq.txt")

# Every non-empty line of ALL_TAGS must hold exactly two whitespace-separated
# fields; the second field is the tag symbol collected into `taglist`.
taglist = []
# Explicit UTF-8 so the result does not depend on the machine's locale.
with open(ALL_TAGS, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines instead of failing on unpack
        word, tag = line.split()
        taglist.append(tag)

In [8]:
import os
import sentencepiece as spm
import json
import logging

def train_sentencepiece_tokenizer(manifest_file, tokenizer_folder, special_tokens=None, vocab_size=5000):
    """Train a SentencePiece tokenizer on the transcripts listed in a manifest.

    Args:
        manifest_file: Path to a JSON-lines manifest; the 'text' field of every
            non-empty line is used as one training sentence.
        tokenizer_folder: Output directory (created if missing). Receives
            'text_data.txt', 'tokenizer.model', 'tokenizer.vocab' and 'vocab.txt'.
        special_tokens: Optional iterable of strings passed to the trainer as
            comma-joined ``user_defined_symbols``. Tokens must therefore not
            contain a comma.
        vocab_size: Vocabulary size handed to the SentencePiece trainer.

    Returns:
        Tuple ``(model_file, vocab_file, vocab_txt_file)`` of output paths.

    Raises:
        json.JSONDecodeError / KeyError: if a non-empty manifest line is not
            valid JSON or has no 'text' key.
    """
    # Configure logging (no-op if the root logger is already configured)
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    logging.info("Starting the tokenizer training process")

    # Step 1: Read the manifest file and extract text data
    def read_manifest(manifest_path):
        sentences = []
        # UTF-8 is forced because the transcripts are multilingual; the locale
        # default encoding would make the result machine-dependent.
        with open(manifest_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # blank lines (e.g. a trailing newline) are not records
                sentences.append(json.loads(line)['text'])
        return sentences

    logging.info("Reading manifest file")
    text_data = read_manifest(manifest_file)
    logging.info(f"Extracted {len(text_data)} sentences from the manifest file")

    # Step 2: Save the extracted text to a temporary file
    os.makedirs(tokenizer_folder, exist_ok=True)

    temp_text_file = os.path.join(tokenizer_folder, 'text_data.txt')
    logging.info(f"Saving extracted text to {temp_text_file}")
    with open(temp_text_file, 'w', encoding='utf-8') as f:
        f.writelines(sentence + '\n' for sentence in text_data)

    # Step 3: Train the SentencePiece tokenizer with special tokens if provided
    model_prefix = os.path.join(tokenizer_folder, 'tokenizer')

    # Arguments shared by both training variants; special tokens are added on top.
    train_kwargs = {
        'input': temp_text_file,
        'model_prefix': model_prefix,
        'vocab_size': vocab_size,
    }
    if special_tokens:
        train_kwargs['user_defined_symbols'] = ','.join(special_tokens)
        logging.info(f"Special tokens provided: {special_tokens}")
        logging.info("Starting SentencePiece training with special tokens")
    else:
        logging.info("Starting SentencePiece training without special tokens")
    spm.SentencePieceTrainer.train(**train_kwargs)

    # Step 4: Return the paths to the tokenizer model and vocab files
    model_file = f"{model_prefix}.model"
    vocab_file = f"{model_prefix}.vocab"

    logging.info("Tokenizer training completed")
    logging.info(f"Model file: {model_file}")
    logging.info(f"Vocab file: {vocab_file}")

    # Step 5: Create a vocab.txt file (first tab-separated column of the .vocab file)
    vocab_txt_file = os.path.join(tokenizer_folder, 'vocab.txt')
    logging.info(f"Creating vocab.txt file at {vocab_txt_file}")
    with open(vocab_file, 'r', encoding='utf-8') as vf, open(vocab_txt_file, 'w', encoding='utf-8') as vtf:
        for line in vf:
            # Strip the newline first so a line without a tab does not yield "token\n\n".
            token = line.rstrip('\n').split('\t')[0]
            vtf.write(token + '\n')

    logging.info(f"vocab.txt file created at {vocab_txt_file}")

    return model_file, vocab_file, vocab_txt_file

In [9]:
# Rebuild the tag list: second whitespace-separated field of every non-empty line.
# A context manager closes the file deterministically, and UTF-8 is explicit so
# the result does not depend on the machine's locale.
with open(ALL_TAGS, 'r', encoding='utf-8') as tag_file:
    taglist = [line.split()[1] for line in tag_file.read().splitlines() if line.strip()]


In [11]:
# Train a 1600-piece tokenizer on the training transcripts, registering every tag as a
# special token. The returned (model, vocab, vocab.txt) paths are the cell's output.
train_sentencepiece_tokenizer(
    manifest_file=TRAIN_MANIFEST,
    tokenizer_folder="/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer",
    special_tokens=taglist,
    vocab_size=1600,
)

Starting the tokenizer training process
Reading manifest file
Extracted 110000 sentences from the manifest file
Saving extracted text to /home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/text_data.txt
Special tokens provided: ['T0', 'T1', 'T2', 'T3', 'T4', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21', 'T22', 'T23', 'T24', 'T25', 'T26', 'T27', 'T28', 'T29', 'T30', 'T31', 'T32', 'T33', 'T34', 'T35', 'T36', 'T37', 'T38', 'T39', 'T40', 'T41', 'T42', 'T43', 'T44', 'T45', 'T46', 'T47', 'T48', 'T49', 'T50', 'T51', 'T52', 'T53', 'T54', 'T55', 'T56', 'T57', 'T58', 'T59', 'T60', 'T61', 'T62', 'T63', 'T64', 'T65', 'T66', 'T67', 'T68', 'T69', 'T70', 'T71', 'T72', 'T73', 'T74', 'T75', 'T76', 'T77', 'T78', 'T79', 'T80', 'T81', 'T82', 'T83', 'T84', 'T85', 'T86', 'T87', 'T88', 'T89', 'T90', 'T91', 'T92', 'T93', 'T94', 'T95', 'T96', 'T97', 'T98', 'T99', 'T100', 'T101', 'T102', 'T103', 'T104', 'T105', 'T106', 'T107',

('/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/tokenizer.model',
 '/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/tokenizer.vocab',
 '/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/vocab.txt')

In [12]:
import subprocess
import os
import sentencepiece as spm
import logging
import sys
import json

def generate_sentencepiece_model_pb2(script_dir, proto_file_path):
    """Run ``protoc`` to generate the Python bindings for a .proto file.

    Args:
        script_dir: Directory passed to protoc as ``--python_out``.
        proto_file_path: Path of the .proto file to compile.

    On failure (protoc exits non-zero, or the protoc executable cannot be
    launched at all) an error is printed and the process exits via
    ``sys.exit(1)``.
    """
    # Construct the command
    command = [
        'protoc',
        f'--python_out={script_dir}',
        proto_file_path,
    ]

    try:
        # Run the command; check=True turns a non-zero exit status into CalledProcessError
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing / non-executable protoc binary, which would
        # otherwise escape as an unhandled FileNotFoundError.
        print(f"Error generating sentencepiece_model_pb2.py: {e}")
        sys.exit(1)

    print("Successfully generated sentencepiece_model_pb2.py")

def edit_spt_model(input_file, output_folder, tokens, vocab_file, vocab_txt_file, is_userdefined=False):
    """Append extra pieces to an existing SentencePiece model and its vocab files.

    Args:
        input_file: Path of the serialized SentencePiece model to extend.
        output_folder: Directory (created if missing) that receives
            'tokenizer.model', 'tokenizer.vocab' and 'vocab.txt'.
        tokens: Iterable of piece strings to add. Pieces already present in the
            model, and repeated entries within ``tokens``, are skipped with a warning.
        vocab_file: Existing '.vocab' file; copied and extended with the new tokens.
        vocab_txt_file: Existing 'vocab.txt' file; copied and extended likewise.
        is_userdefined: If True new pieces get type 4 (USER_DEFINED in
            sentencepiece_model.proto), otherwise type 3 (CONTROL).

    If the edited model cannot be loaded by SentencePiece, the error is logged
    and the process exits via ``sys.exit(1)`` before anything is written.
    """
    from sentencepiece_model_pb2 import ModelProto  # Ensure this import is after the proto generation

    os.makedirs(output_folder, exist_ok=True)

    output_model_file = os.path.join(output_folder, 'tokenizer.model')
    output_vocab_file = os.path.join(output_folder, 'tokenizer.vocab')
    output_vocab_txt_file = os.path.join(output_folder, 'vocab.txt')

    token_type = 4 if is_userdefined else 3

    model = ModelProto()
    with open(input_file, 'rb') as model_fh:
        model.ParseFromString(model_fh.read())

    existing_tokens = {piece.piece for piece in model.pieces}

    new_tokens = []
    added = set()
    for token in tokens:
        if token in added:
            # Guard against appending the same piece twice when `tokens` has repeats.
            logging.warning(f"Special Token '{token}' listed more than once, skipping duplicate.")
            continue
        if token in existing_tokens:
            logging.warning(f"Special Token '{token}' already exists in the input model, skipping.")
            continue
        piece = model.SentencePiece(piece=token, score=0.0, type=token_type)
        model.pieces.append(piece)
        added.add(token)
        new_tokens.append(token)

    serialized_model = model.SerializeToString()

    # Validate the edited model by loading it before anything is written to disk.
    sp = spm.SentencePieceProcessor()
    try:
        sp.LoadFromSerializedProto(serialized_model)
        for token in new_tokens:
            token_id = sp.piece_to_id(token)
            logging.info(f"Created token '{token}' at ID {token_id}")
        logging.info(f"New tokenizer vocab size: {sp.get_piece_size()}")
    except Exception:
        # logging.exception keeps the original message and adds the traceback;
        # `except Exception` no longer swallows KeyboardInterrupt / SystemExit.
        logging.exception("Could not appropriately configure new tokenizer. Verify if the special tokens already exist.")
        sys.exit(1)

    with open(output_model_file, 'wb') as outf:
        outf.write(serialized_model)

    logging.info(f"Created new tokenizer at: {output_model_file}")

    # Read the original vocab file and append the new tokens.
    # NOTE(review): appended lines carry only the token, without the tab-separated
    # second column the existing .vocab lines appear to have — confirm consumers accept this.
    with open(vocab_file, 'r', encoding='utf-8') as original_vocab_file:
        original_vocab = original_vocab_file.readlines()

    with open(output_vocab_file, 'w', encoding='utf-8') as updated_vocab_file:
        updated_vocab_file.writelines(original_vocab)
        updated_vocab_file.writelines(f"{token}\n" for token in new_tokens)

    # Update vocab.txt
    with open(vocab_txt_file, 'r', encoding='utf-8') as original_vocab_txt_file:
        original_vocab_txt = original_vocab_txt_file.readlines()

    with open(output_vocab_txt_file, 'w', encoding='utf-8') as updated_vocab_txt_file:
        updated_vocab_txt_file.writelines(original_vocab_txt)
        updated_vocab_txt_file.writelines(f"{token}\n" for token in new_tokens)

    logging.info(f"Updated vocab files: {output_vocab_file}, {output_vocab_txt_file}")

def update_model_config(model, new_model_path):
    """Point the tokenizer section of a loaded config mapping at a new model file.

    ``model`` is indexed as ``model['cfg']['tokenizer']`` and modified in place;
    nothing is returned.
    """
    tokenizer_section = model['cfg']['tokenizer']
    tokenizer_section['model_path'] = new_model_path
    logging.info(f"Updated model configuration with new tokenizer model path: {new_model_path}")


# # Define input and output paths
# input_folder = MODEL_ROOT / "tokenizer"
# output_folder = MODEL_ROOT / "new_tokenizer"


# #input_folder = '/external/ksingla/models/nemo/stt_en_conformer_ctc_small/tokenizer'
# #output_folder = '/external/ksingla/models/nemo/stt_en_conformer_ctc_small/new_tokenizer'
# #proto_dir = '/path/to/save/proto'  # Define the actual path where the proto file should be saved
# #proto_file = '/path/to/sentencepiece_model.proto'  # Define the actual path to the sentencepiece_model.proto file

# input_file = input_folder / 'tokenizer.model'
# vocab_file = input_folder / 'tokenizer.vocab'
# vocab_txt_file = input_folder / 'vocab.txt'

# # input_file = os.path.join(input_folder, 'tokenizer.model')
# # vocab_file = os.path.join(input_folder, 'tokenizer.vocab')
# # vocab_txt_file = os.path.join(input_folder, 'vocab.txt')

# # Include all single-digit integers in the tokens list
# punctuations = ['.', ',', '?', '!', ';', ':', '-', '(', ')', '[', ']', '{', '}', '<', '>', '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '+', '=', '~', '`', '_', '"', "'"]
# tokens = taglist + [str(i) for i in range(10)] + punctuations
# is_userdefined = True

# # Step 1: Generate the sentencepiece_model_pb2.py file
# #generate_sentencepiece_model_pb2(proto_dir, proto_file)

# # Step 2: Edit the SentencePiece model
# edit_spt_model(input_file, output_folder, tokens, vocab_file, vocab_txt_file, is_userdefined)

# Step 3: Load the model configuration and update it
# model_config_file = '/path/to/model/config.json'  # Define the actual path to the model config file
# with open(model_config_file, 'r') as f:
#     model = json.load(f)

# new_model_path = os.path.join(output_folder, 'tokenizer.model')
# update_model_config(model, new_model_path)

# # Save the updated model configuration
# with open(model_config_file, 'w') as f:
#     json.dump(model, f, indent=4)

#logging.info(f"Updated model configuration saved to: {model_config_file}")


In [13]:
# Switch the model over to the tokenizer trained above (directory with the tokenizer files).
model.change_vocabulary(
    new_tokenizer_dir="/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/",
    new_tokenizer_type="bpe",
)

[NeMo W 2024-07-18 00:20:47 modelPT:258] You tried to register an artifact under config key=tokenizer.model_path but an artifact for it has already been registered.
[NeMo W 2024-07-18 00:20:47 modelPT:258] You tried to register an artifact under config key=tokenizer.vocab_path but an artifact for it has already been registered.
[NeMo W 2024-07-18 00:20:47 modelPT:258] You tried to register an artifact under config key=tokenizer.spe_tokenizer_vocab but an artifact for it has already been registered.


[NeMo I 2024-07-18 00:20:47 mixins:172] Tokenizer SentencePieceTokenizer initialized with 1600 tokens
[NeMo I 2024-07-18 00:20:47 ctc_bpe_models:248] 
    Replacing old number of classes (128) with new number of classes - 1600
[NeMo I 2024-07-18 00:20:47 ctc_bpe_models:290] Changed tokenizer to ['<unk>', '<s>', '</s>', 'T0', 'T1', 'T2', 'T3', 'T4', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21', 'T22', 'T23', 'T24', 'T25', 'T26', 'T27', 'T28', 'T29', 'T30', 'T31', 'T32', 'T33', 'T34', 'T35', 'T36', 'T37', 'T38', 'T39', 'T40', 'T41', 'T42', 'T43', 'T44', 'T45', 'T46', 'T47', 'T48', 'T49', 'T50', 'T51', 'T52', 'T53', 'T54', 'T55', 'T56', 'T57', 'T58', 'T59', 'T60', 'T61', 'T62', 'T63', 'T64', 'T65', 'T66', 'T67', 'T68', 'T69', 'T70', 'T71', 'T72', 'T73', 'T74', 'T75', 'T76', 'T77', 'T78', 'T79', 'T80', 'T81', 'T82', 'T83', 'T84', 'T85', 'T86', 'T87', 'T88', 'T89', 'T90', 'T91', 'T92', 'T93', 'T94', 'T95', 'T96', 'T97', 'T98', 'T

In [14]:
# Use the GPU when CUDA is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    accelerator = 'gpu'
else:
    accelerator = 'cpu'
max_steps = 600000

# Single-device trainer. Checkpointing and logging are disabled on the Trainer itself;
# NOTE(review): presumably because exp_manager attaches its own in a later cell — confirm.
trainer = Trainer(
    devices=1,
    accelerator=accelerator,
    max_steps=max_steps,
    enable_checkpointing=False,
    logger=False,
    log_every_n_steps=50,
    check_val_every_n_epoch=1,
    accumulate_grad_batches=8,
)

model.set_trainer(trainer)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
# utility method
import json
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest

In [16]:
with open_dict(model.cfg):
    # Training dataloader: plain (non-tarred) manifest from the fine-tuning set.
    train_ds = model.cfg.train_ds
    train_ds.manifest_filepath = TRAIN_MANIFEST
    train_ds.batch_size = 12
    train_ds.is_tarred = False
    train_ds.tarred_audio_filepaths = None
    train_ds.num_workers = 8

    # Validation dataloader.
    validation_ds = model.cfg.validation_ds
    validation_ds.manifest_filepath = TEST_MANIFEST
    validation_ds.batch_size = 12
    validation_ds.num_workers = 8

# Build the loaders; the validation config is reused for the test loader as well.
model.setup_training_data(model.cfg.train_ds)
model.setup_multiple_validation_data(model.cfg.validation_ds)
model.setup_multiple_test_data(model.cfg.validation_ds)

[NeMo I 2024-07-18 00:20:58 collections:196] Dataset loaded with 110000 files totalling 146.67 hours
[NeMo I 2024-07-18 00:20:58 collections:197] 0 files were filtered totalling 0.00 hours
[NeMo I 2024-07-18 00:20:58 collections:196] Dataset loaded with 4474 files totalling 5.94 hours
[NeMo I 2024-07-18 00:20:58 collections:197] 0 files were filtered totalling 0.00 hours
[NeMo I 2024-07-18 00:20:58 collections:196] Dataset loaded with 4474 files totalling 5.94 hours
[NeMo I 2024-07-18 00:20:58 collections:197] 0 files were filtered totalling 0.00 hours


In [17]:
with open_dict(model.cfg):
    # Placeholder overrides: every SpecAugment setting is written back unchanged.
    # Replace the right-hand side with a concrete value to tune a setting.
    spec_cfg = model.cfg.spec_augment
    for setting in ("freq_masks", "freq_width", "time_masks", "time_width"):
        setattr(spec_cfg, setting, getattr(spec_cfg, setting))

# Re-create the augmentation module from the (possibly edited) config.
model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)

In [18]:
# Show the optimizer / scheduler section of the model config, when there is one.
has_optim_section = 'optim' in model.cfg
if has_optim_section:
    optim_yaml = OmegaConf.to_yaml(model.cfg.optim)
    print(optim_yaml)

name: adamw
lr: 2.0
betas:
- 0.9
- 0.98
weight_decay: 0.001
sched:
  name: NoamAnnealing
  d_model: 512
  warmup_steps: 10000
  warmup_ratio: null
  min_lr: 1.0e-06



In [19]:
with open_dict(model.cfg):
    # Fine-tuning overrides for the optimizer and its scheduler.
    optim_cfg = model.cfg.optim
    optim_cfg.lr = 0.1
    optim_cfg.weight_decay = 0.0001
    optim_cfg.sched.warmup_steps = 1000

# Rebuild optimizer + scheduler from the edited config (trailing ';' hides the cell output).
model.setup_optimization(model.cfg.optim);

[NeMo I 2024-07-18 00:21:00 modelPT:723] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.1
        maximize: False
        weight_decay: 0.0001
    )
[NeMo I 2024-07-18 00:21:00 lr_scheduler:915] Scheduler "<nemo.core.optim.lr_scheduler.NoamAnnealing object at 0x712322e4fb20>" 
    will be used during training (effective maximum steps = 600000) - 
    Parameters : 
    (d_model: 512
    warmup_steps: 1000
    warmup_ratio: null
    min_lr: 1.0e-06
    max_steps: 600000
    )


In [None]:
# if hasattr(model, 'adapter_module_names'):
#   print(model.adapter_module_names)

In [None]:
# for module in model.children():
#   if hasattr(module, 'get_accepted_adapter_types'):
#     types = module.get_accepted_adapter_types()
#     print("Module : ", module.__class__.__name__)

#     for tp in types:
#       print(tp)
#     print()

In [None]:
# from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig

In [None]:

# adapter_name = "AN4" #@param {type:"string"}
# adapter_dim = 32 #@param {type:"integer"}
# adapter_activation = "swish" #@param {type:"string"}
# adapter_norm_position = "pre" #@param ["pre", "post"]

In [None]:
# adapter_cfg = LinearAdapterConfig(
#     in_features=model.cfg.encoder.d_model,  # conformer specific model dim. Every layer emits this dim at its output.
#     dim=adapter_dim,  # the bottleneck dimension of the adapter
#     activation=adapter_activation,  # activation used in bottleneck block
#     norm_position=adapter_norm_position,  # whether to use LayerNorm at the beginning or the end of the adapter
# )
# print(adapter_cfg)

In [20]:
# Per-module parameter counts of the model after the tokenizer change.
model.summarize()

  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConformerEncoder                  | 121 M 
2 | spec_augmentation | SpectrogramAugmentation           | 0     
3 | wer               | WER                               | 0     
4 | decoder           | ConvASRDecoder                    | 821 K 
5 | loss              | CTCLoss                           | 0     
------------------------------------------------------------------------
122 M     Trainable params
0         Non-trainable params
122 M     Total params
489.026   Total estimated model params size (MB)

In [None]:
# model.add_adapter(name=adapter_name, cfg=adapter_cfg)

In [21]:
# NOTE(review): the add_adapter cell before this one is commented out, so this
# summary repeats the previous one — candidate for removal.
model.summarize()

  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConformerEncoder                  | 121 M 
2 | spec_augmentation | SpectrogramAugmentation           | 0     
3 | wer               | WER                               | 0     
4 | decoder           | ConvASRDecoder                    | 821 K 
5 | loss              | CTCLoss                           | 0     
------------------------------------------------------------------------
122 M     Trainable params
0         Non-trainable params
122 M     Total params
489.026   Total estimated model params size (MB)

In [None]:
# model.set_enabled_adapters(enabled=False)  # disable all adapters
# model.set_enabled_adapters(name=adapter_name, enabled=True)  # enable only the current adapter we want to train

In [None]:
# model.freeze()
# model.unfreeze_enabled_adapters()
# #model.unfreeze()
# model.decoder.unfreeze()

In [22]:
# NOTE(review): the freeze / unfreeze cells before this one are commented out, so
# all parameters remain trainable and this summary is unchanged — candidate for removal.
model.summarize()

  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConformerEncoder                  | 121 M 
2 | spec_augmentation | SpectrogramAugmentation           | 0     
3 | wer               | WER                               | 0     
4 | decoder           | ConvASRDecoder                    | 821 K 
5 | loss              | CTCLoss                           | 0     
------------------------------------------------------------------------
122 M     Trainable params
0         Non-trainable params
122 M     Total params
489.026   Total estimated model params size (MB)

In [23]:
# Let NeMo's experiment manager take care of checkpoint saving and logging.
from nemo.utils import exp_manager

# NEMO_EXPM_VERSION is generally used for multi-node multi-gpu training. In notebook
# environments it is unnecessary and can make logs of separate runs overwrite each other,
# so drop it if it happens to be set.
os.environ.pop('NEMO_EXPM_VERSION', None)

# Track validation WER (lower is better); keep the best model and always export a .nemo file.
checkpoint_params = exp_manager.CallbackParams(
    monitor="val_wer",
    mode="min",
    always_save_nemo=True,
    save_best_model=True,
)

exp_config = OmegaConf.structured(
    exp_manager.ExpManagerConfig(
        exp_dir='/external2/karan_exp/experiments/',
        name="finetune-multidomain-alllang-100k-ctc",
        checkpoint_callback_params=checkpoint_params,
    )
)

logdir = exp_manager.exp_manager(trainer, exp_config)

[NeMo I 2024-07-18 00:21:12 exp_manager:396] Experiments will be logged at /external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12
[NeMo I 2024-07-18 00:21:12 exp_manager:842] TensorboardLogger has been set up


[NeMo W 2024-07-18 00:21:12 exp_manager:952] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 600000. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


In [24]:
# Inspect the tokenizer section of the config to confirm it points at the new tokenizer files.
model.cfg.tokenizer



{'dir': '/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/', 'type': 'bpe', 'model_path': '/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/tokenizer.model', 'vocab_path': '/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/vocab.txt', 'spe_tokenizer_vocab': '/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/tokenizer.vocab'}

In [25]:
# Final parameter summary right before training starts.
model.summarize()

  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConformerEncoder                  | 121 M 
2 | spec_augmentation | SpectrogramAugmentation           | 0     
3 | wer               | WER                               | 0     
4 | decoder           | ConvASRDecoder                    | 821 K 
5 | loss              | CTCLoss                           | 0     
------------------------------------------------------------------------
122 M     Trainable params
0         Non-trainable params
122 M     Total params
489.026   Total estimated model params size (MB)

In [26]:
# Finally, fine-tune the model. The adapter cells in this notebook are commented out,
# so this trains the full model (all parameters are reported as trainable above), not adapters.
trainer.fit(model)

You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2024-07-18 00:21:18 modelPT:723] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.1
        maximize: False
        weight_decay: 0.0001
    )
[NeMo I 2024-07-18 00:21:18 lr_scheduler:915] Scheduler "<nemo.core.optim.lr_scheduler.NoamAnnealing object at 0x7122ecd7eaa0>" 
    will be used during training (effective maximum steps = 600000) - 
    Parameters : 
    (d_model: 512
    warmup_steps: 1000
    warmup_ratio: null
    min_lr: 1.0e-06
    max_steps: 600000
    )



  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConformerEncoder                  | 121 M 
2 | spec_augmentation | SpectrogramAugmentation           | 0     
3 | wer               | WER                               | 0     
4 | decoder           | ConvASRDecoder                    | 821 K 
5 | loss              | CTCLoss                           | 0     
------------------------------------------------------------------------
122 M     Trainable params
0         Non-trainable params
122 M     Total params
489.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

[NeMo I 2024-07-18 00:21:20 wer:318] 
    
[NeMo I 2024-07-18 00:21:20 wer:319] reference:T1 T2 T79_T16_T17 माझ्या T107 T79_T105 निशाण्या आहेत T107 T79_T9 तुम्हाला T107 T79_T30 दवा देण्याची सल्ला आहे T107 T76_T30_T10 T0
[NeMo I 2024-07-18 00:21:20 wer:320] predicted:कोfeTyTT18T251़T663જकोT321T400T564T1099T580T1043T580ichলT1113कोT1272ॉT275T580T275ßT275T58T275ेंT1565T580T58T1565yT886yT498घT1269दT92T78T678ताT1515luణtoT663T580T58घचीT1453़T678ੁT663T87T365T886T501T275T580T365T1565IT365T580 IT321મT240T458उT945T289T275T461T272T501T374T967TT87માંTમાંTમાં
[NeMo I 2024-07-18 00:21:20 wer:318] 
    
[NeMo I 2024-07-18 00:21:20 wer:319] reference:T1 T2 T79_T100_T62 Aguero T107 est remplacé par T79_T100_T62 Jesus T107 à la T79_T44 80ème minute T107 pour T79_T73_T62 Manchester City T107 . T76_T9 T0
[NeMo I 2024-07-18 00:21:20 wer:320] predicted:कोT967feyT1005 एकT823T58T197T1216T78T87T530T1551T60અT1099T1127bT0़T1675T1035T582T1565માંీમાંT696toT688માંT6T365 నాમੀਂউT1313T0T36T1348T1307T365 ਕਰਯT1323toT89T8

Training: 0it [00:00, ?it/s]

[NeMo I 2024-07-18 00:21:20 preemption:56] Preemption requires torch distributed to be initialized, disabling preemption
[NeMo I 2024-07-18 00:21:39 wer:318] 
    
[NeMo I 2024-07-18 00:21:39 wer:319] reference:T1 T2 কি তুমি T79_T9 করতে পারবেন T107 ডাইনিং T79_T49 রুমের T79_T92 লাইট T107 কালো করতে ? T76_T92_T14 T0
[NeMo I 2024-07-18 00:21:39 wer:320] predicted:कोT967T1028માંTT1028TyT1378TમાંT1371yT1045yT374yT100िंगT980िंगT577T1315T1269T1472T92T935T1273T47gaverT1285verT1388T696viT87enT275enT1019T971लßक्याverकेT1621T814T222માં২T374T283TT374TT1099માંT374માંT199TમાંyT1099yT87yఆT58T1515T678వT1005öT168T272T374T1344T296T1515T374માંT1099T374T1099T87T1515T296ડT18bT197T1269T172केષT92T461luनT1024ेंT296T891తT222T1099કોT1313 eineT823T43T272endT87gaT78gaউT580T1666T731T6T1028T6T222T891T1371T374માંT967માંT967feT967TT967માં
[NeMo I 2024-07-18 00:21:58 wer:318] 
    
[NeMo I 2024-07-18 00:21:58 wer:319] reference:T1 T2 T79_T9 ਦੁਕਾਨ T107 ਦੀ T79_T49 ਲੱਭ T107 ਦੋ ਜਿ ⁇ ੇ T79_T36 ਚਾਵਲ T107 ਹਨ T76_T28_T90 T0
[N

Validation: 0it [00:00, ?it/s]

[NeMo I 2024-07-18 01:21:03 wer:318] 
    
[NeMo I 2024-07-18 01:21:03 wer:319] reference:T1 T2 T79_T16_T17 माझ्या T107 T79_T105 निशाण्या आहेत T107 T79_T9 तुम्हाला T107 T79_T30 दवा देण्याची सल्ला आहे T107 T76_T30_T10 T0
[NeMo I 2024-07-18 01:21:03 wer:320] predicted:T1 T2 T79_T17  T79_T9  T79_  T79__T104  T79. T76_T10 T0
[NeMo I 2024-07-18 01:21:03 wer:318] 
    
[NeMo I 2024-07-18 01:21:03 wer:319] reference:T1 T2 T79_T100_T62 Aguero T107 est remplacé par T79_T100_T62 Jesus T107 à la T79_T44 80ème minute T107 pour T79_T73_T62 Manchester City T107 . T76_T9 T0
[NeMo I 2024-07-18 01:21:03 wer:320] predicted:T1 T2 T79_T9  T107 __  T79   T79. T76_T10 T0
[NeMo I 2024-07-18 01:21:03 wer:318] 
    
[NeMo I 2024-07-18 01:21:03 wer:319] reference:T1 T2 T79_T9 Pausieren T107 Sie die T79_T115_T104 HIV-Test T107 und setzen Sie ihn später fort . T76_T6_T115 T0
[NeMo I 2024-07-18 01:21:03 wer:320] predicted:T1 T2 T79_T9  T107 T79_T9. T76_T10 T0
[NeMo I 2024-07-18 01:21:04 wer:318] 
    
[NeMo I 2024

Epoch 0, global step 1146: 'val_wer' reached 0.78622 (best 0.78622), saving model to '/external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc--val_wer=0.7862-epoch=0.ckpt' as top 3


[NeMo I 2024-07-18 01:21:55 nemo_model_checkpoint:177] New best .nemo model saved to: /external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc.nemo
[NeMo I 2024-07-18 01:22:18 wer:318] 
    
[NeMo I 2024-07-18 01:22:18 wer:319] reference:T1 T2 T79_T9 T79_T36 चावी T107 ची T79_T51 किंमत T107 तपासा T107 T76_T65_T51 T0
[NeMo I 2024-07-18 01:22:18 wer:320] predicted:T1 T2 T79_T9   T79_T104  T79  T76_T14 T0
[NeMo I 2024-07-18 01:22:38 wer:318] 
    
[NeMo I 2024-07-18 01:22:38 wer:319] reference:T1 T2 T79_T9 મારી T79_T75_T104 માહિતી T107 ની T79_T381_T104 ન્યૂઝ T107 જોવો . T76_T55_T381 T0
[NeMo I 2024-07-18 01:22:38 wer:320] predicted:T1 T2 T79_T9 માર _. T76_T10 T0
[NeMo I 2024-07-18 01:22:57 wer:318] 
    
[NeMo I 2024-07-18 01:22:57 wer:319] reference:T1 T2 T79_T100_T62 Oyarzabal END's versatility to play across the front line provides T79_T73_T62 Real Sociedad T107 with attacking options and unpredictabilit

Validation: 0it [00:00, ?it/s]

[NeMo I 2024-07-18 02:22:01 wer:318] 
    
[NeMo I 2024-07-18 02:22:01 wer:319] reference:T1 T2 T79_T16_T17 माझ्या T107 T79_T105 निशाण्या आहेत T107 T79_T9 तुम्हाला T107 T79_T30 दवा देण्याची सल्ला आहे T107 T76_T30_T10 T0
[NeMo I 2024-07-18 02:22:01 wer:320] predicted:T1 T2 T79_T16_T17 माझ्या T107 T79_T9 ी T107 T79_T105  T107 T79_T105 ी T79 ी T107 T76_T105_T10 T0
[NeMo I 2024-07-18 02:22:02 wer:318] 
    
[NeMo I 2024-07-18 02:22:02 wer:319] reference:T1 T2 T79_T100_T62 Aguero T107 est remplacé par T79_T100_T62 Jesus T107 à la T79_T44 80ème minute T107 pour T79_T73_T62 Manchester City T107 . T76_T9 T0
[NeMo I 2024-07-18 02:22:02 wer:320] predicted:T1 T2 T79_T9 Aer T107 T79T9 s T79s T79   T107. T76_T10 T0
[NeMo I 2024-07-18 02:22:02 wer:318] 
    
[NeMo I 2024-07-18 02:22:02 wer:319] reference:T1 T2 T79_T9 Pausieren T107 Sie die T79_T115_T104 HIV-Test T107 und setzen Sie ihn später fort . T76_T6_T115 T0
[NeMo I 2024-07-18 02:22:02 wer:320] predicted:T1 T2 T79_T9 Aen T107 Sie die T79_T104 

Epoch 1, global step 2292: 'val_wer' reached 0.69641 (best 0.69641), saving model to '/external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc--val_wer=0.6964-epoch=1.ckpt' as top 3


[NeMo I 2024-07-18 02:23:07 nemo_model_checkpoint:177] New best .nemo model saved to: /external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc.nemo
[NeMo I 2024-07-18 02:23:30 wer:318] 
    
[NeMo I 2024-07-18 02:23:30 wer:319] reference:T1 T2 T79_T9 Planen T107 Sie eine T79_T121_T104 Zugfahrt T107 nach T79_T69 München T107 für T79_T13 nächsten Monat T107 T76_T136 T0
[NeMo I 2024-07-18 02:23:30 wer:320] predicted:T1 T2 T79_T9 Aen T107 Sie T79_T104  T79_T104  T79_ en T107. T76_T82 T0
[NeMo I 2024-07-18 02:23:50 wer:318] 
    
[NeMo I 2024-07-18 02:23:50 wer:319] reference:T1 T2 T79_T9 ઓનિયન્સ T107 ને T79_T35 ખરીદી યાદી T107 માં થી દૂર કરો . T76_T103_T26 T0
[NeMo I 2024-07-18 02:23:50 wer:320] predicted:T1 T2 T79_T9 ર્ટ T79ી  T79ો કરો. T76_T123 T0
[NeMo I 2024-07-18 02:24:11 wer:318] 
    
[NeMo I 2024-07-18 02:24:11 wer:319] reference:T1 T2 Do you know how to T79_T9 find T107 a T79_T206_T104 chicken nood

Validation: 0it [00:00, ?it/s]

[NeMo I 2024-07-18 03:24:06 wer:318] 
    
[NeMo I 2024-07-18 03:24:06 wer:319] reference:T1 T2 T79_T16_T17 माझ्या T107 T79_T105 निशाण्या आहेत T107 T79_T9 तुम्हाला T107 T79_T30 दवा देण्याची सल्ला आहे T107 T76_T30_T10 T0
[NeMo I 2024-07-18 03:24:06 wer:320] predicted:T1 T2 T79_T16_T17 माझ्या T107 T79_T9 ख आहे T107 T79_T105 द T107 T79_T44 ददी T79T44  आहे T107 T76_T105_T10 T0
[NeMo I 2024-07-18 03:24:07 wer:318] 
    
[NeMo I 2024-07-18 03:24:07 wer:319] reference:T1 T2 T79_T100_T62 Aguero T107 est remplacé par T79_T100_T62 Jesus T107 à la T79_T44 80ème minute T107 pour T79_T73_T62 Manchester City T107 . T76_T9 T0
[NeMo I 2024-07-18 03:24:07 wer:320] predicted:T1 T2 T79_T9 A T107 T79s T79ps T107 T79 ment  é T107. T76_T10 T0
[NeMo I 2024-07-18 03:24:07 wer:318] 
    
[NeMo I 2024-07-18 03:24:07 wer:319] reference:T1 T2 T79_T9 Pausieren T107 Sie die T79_T115_T104 HIV-Test T107 und setzen Sie ihn später fort . T76_T6_T115 T0
[NeMo I 2024-07-18 03:24:07 wer:320] predicted:T1 T2 T79_T9 Aieren 

Epoch 2, global step 3438: 'val_wer' reached 0.67005 (best 0.67005), saving model to '/external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc--val_wer=0.6701-epoch=2.ckpt' as top 3


[NeMo I 2024-07-18 03:25:06 nemo_model_checkpoint:177] New best .nemo model saved to: /external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc.nemo
[NeMo I 2024-07-18 03:25:29 wer:318] 
    
[NeMo I 2024-07-18 03:25:29 wer:319] reference:T1 T2 T79_T9 Plan T107 a trip to T79_T69 Barcelona T107 for T79_T13 February 20th T107 T76_T1106 T0
[NeMo I 2024-07-18 03:25:29 wer:320] predicted:T1 T2 T79_T9 Po T107 T79_T104 coc T107 T79  T107 for T79_T13  T107 T76_T10 T0
[NeMo I 2024-07-18 03:25:48 wer:318] 
    
[NeMo I 2024-07-18 03:25:48 wer:319] reference:T1 T2 T79_T9 T79_T121_T104 बाइक T107 की T79_T85_T36 स्थिति T107 देखें T107 T79_T69 बर्लिन T107 के लिए T76_T65 T0
[NeMo I 2024-07-18 03:25:48 wer:320] predicted:T1 T2 T79_T9 T79_  T107  की T79__T104   ी T79_ ब T107  िए T76_T51 T0
[NeMo I 2024-07-18 03:26:08 wer:318] 
    
[NeMo I 2024-07-18 03:26:08 wer:319] reference:T1 T2 T79_T9 કોસ્ટ્કો માં T79_T36 બિસ્કિટ્સ 

Validation: 0it [00:00, ?it/s]

[NeMo I 2024-07-18 04:24:54 wer:318] 
    
[NeMo I 2024-07-18 04:24:54 wer:319] reference:T1 T2 T79_T16_T17 माझ्या T107 T79_T105 निशाण्या आहेत T107 T79_T9 तुम्हाला T107 T79_T30 दवा देण्याची सल्ला आहे T107 T76_T30_T10 T0
[NeMo I 2024-07-18 04:24:54 wer:320] predicted:T1 T2 T79_T16_T17 माझ्या T107 T79_T9 न आहे T107 T79_T105 ला T79द आहे T107 T76_T105_T10 T0
[NeMo I 2024-07-18 04:24:54 wer:318] 
    
[NeMo I 2024-07-18 04:24:54 wer:319] reference:T1 T2 T79_T100_T62 Aguero T107 est remplacé par T79_T100_T62 Jesus T107 à la T79_T44 80ème minute T107 pour T79_T73_T62 Manchester City T107 . T76_T9 T0
[NeMo I 2024-07-18 04:24:54 wer:320] predicted:T1 T2 T79_T9 Are T107 T79T104 ps T107 ps à T79   pour T79it T107. T76_T10 T0
[NeMo I 2024-07-18 04:24:54 wer:318] 
    
[NeMo I 2024-07-18 04:24:54 wer:319] reference:T1 T2 T79_T9 Pausieren T107 Sie die T79_T115_T104 HIV-Test T107 und setzen Sie ihn später fort . T76_T6_T115 T0
[NeMo I 2024-07-18 04:24:54 wer:320] predicted:T1 T2 T79_T9 Atuieren T107 

Epoch 3, global step 4584: 'val_wer' reached 0.63581 (best 0.63581), saving model to '/external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc--val_wer=0.6358-epoch=3.ckpt' as top 3


[NeMo I 2024-07-18 04:25:58 nemo_model_checkpoint:177] New best .nemo model saved to: /external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc.nemo
[NeMo I 2024-07-18 04:26:22 wer:318] 
    
[NeMo I 2024-07-18 04:26:22 wer:319] reference:T1 T2 T79_T9 મારી T79_T26_T104 ખરીદી લિસ્ટ T107 માં T79_T109 2 T107 T79_T36 પેન્સિલ T107 ઉમેરો T107 T76_T58_T43_T26 T0
[NeMo I 2024-07-18 04:26:22 wer:320] predicted:T1 T2 T79_T9 મારી T79_T26_T104 ખરીદ િસ _T109 T107 T79_T36 પસ્ સલ T107 ઉમેો T107 T76_T58_T43_T26 T0
[NeMo I 2024-07-18 04:26:42 wer:318] 
    
[NeMo I 2024-07-18 04:26:42 wer:319] reference:T1 T2 तुम्हाला T79_T92 विंडो शटर T107 T79_T9 क्लोज T107 करायचं आहे का ? T76_T92_T14 T0
[NeMo I 2024-07-18 04:26:42 wer:320] predicted:T1 T2 ुला T79_T92 लो च T107 T79_T60_T9 ख् T107 आहे? T76_T92_T14 T0
[NeMo I 2024-07-18 04:27:07 wer:318] 
    
[NeMo I 2024-07-18 04:27:07 wer:319] reference:T1 T2 T79_T100_T62 रियल बेतिस T1

Validation: 0it [00:00, ?it/s]

[NeMo I 2024-07-18 05:26:43 wer:318] 
    
[NeMo I 2024-07-18 05:26:43 wer:319] reference:T1 T2 T79_T16_T17 माझ्या T107 T79_T105 निशाण्या आहेत T107 T79_T9 तुम्हाला T107 T79_T30 दवा देण्याची सल्ला आहे T107 T76_T30_T10 T0
[NeMo I 2024-07-18 05:26:43 wer:320] predicted:T1 T2 T79_T16_T17 माझ्या T107 T79_T105 नि आहे T107 T79_T105 तुला T44 ददिीची स आहे T107 T76_T105_T10 T0
[NeMo I 2024-07-18 05:26:43 wer:318] 
    
[NeMo I 2024-07-18 05:26:43 wer:319] reference:T1 T2 T79_T100_T62 Aguero T107 est remplacé par T79_T100_T62 Jesus T107 à la T79_T44 80ème minute T107 pour T79_T73_T62 Manchester City T107 . T76_T9 T0
[NeMo I 2024-07-18 05:26:43 wer:320] predicted:T1 T2 T79_T9 Are T107 T79T9 ppse T107 T79par T107 à T79_   pour T79it. T76_T10 T0
[NeMo I 2024-07-18 05:26:43 wer:318] 
    
[NeMo I 2024-07-18 05:26:43 wer:319] reference:T1 T2 T79_T9 Pausieren T107 Sie die T79_T115_T104 HIV-Test T107 und setzen Sie ihn später fort . T76_T6_T115 T0
[NeMo I 2024-07-18 05:26:43 wer:320] predicted:T1 T2 T79

Epoch 4, global step 5730: 'val_wer' reached 0.60913 (best 0.60913), saving model to '/external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc--val_wer=0.6091-epoch=4.ckpt' as top 3


[NeMo I 2024-07-18 05:27:49 nemo_model_checkpoint:177] New best .nemo model saved to: /external2/karan_exp/experiments/finetune-multidomain-alllang-100k-ctc/2024-07-18_00-21-12/checkpoints/finetune-multidomain-alllang-100k-ctc.nemo
[NeMo I 2024-07-18 05:28:16 wer:318] 
    
[NeMo I 2024-07-18 05:28:16 wer:319] reference:T1 T2 T79_T9 મારું T79_T38_T104 પૂર્ણ T107 કરો T79_T44 સાથે T107 . T76_T31_T38 T0
[NeMo I 2024-07-18 05:28:16 wer:320] predicted:T1 T2 T79_T9 મારું T79_T104 પણ કરો T107 T79_T44 સે. T76_T105_T120 T0
[NeMo I 2024-07-18 05:28:37 wer:318] 
    
[NeMo I 2024-07-18 05:28:37 wer:319] reference:T1 T2 T79_T16_T17 నాకు T107 T79_T9 తెలియజేయండి T107 T79_T105 నా తొందరలు T107 T79_T13 ఇరవు నుండి T107 T76_T105_T10 T0
[NeMo I 2024-07-18 05:28:37 wer:320] predicted:T1 T2 T79_T16_T17 నాకు T107 T79_T9 తెిించ చేయండి T107 T79_T105 నా రను T107 T79_T13 జోండి T107 T76_T105_T10 T0
[NeMo I 2024-07-18 05:28:55 wer:318] 
    
[NeMo I 2024-07-18 05:28:55 wer:319] reference:T1 T2 T79_T16_T17 My aunt 