## For valdation with given model path

In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('/home/bld56/gsoc/nemo/NeMo-opensource/'))
import nemo.core as nemo_core
from nemo.core import adapter_mixins
from nemo.utils import exp_manager
import nemo.collections.asr as nemo_asr
import nemo
import json
from omegaconf import OmegaConf, open_dict
import torch
from pytorch_lightning import Trainer
from lightning.pytorch.loggers import WandbLogger
from torchmetrics.text import WordErrorRate
import warnings
import argparse

In [2]:
def load_and_configure_model(config_file_path):
    conf = OmegaConf.load(config_file_path)
    overrides = OmegaConf.from_cli()
    updated_conf = OmegaConf.merge(conf, overrides)
    OmegaConf.set_struct(updated_conf, True)
    model = nemo_asr.models.AV_EncDecCTCModelBPE(updated_conf)

    model.setup_training_data(model.cfg.train_ds)
    return model, conf

# Function to freeze and unfreeze model parameters based on adapters
def manage_model_adapters(model, conf):
    # Freeze the entire model
    model.freeze()
    
    # Determine which modules to train based on configuration
    if model.cfg.use_video_modality:
        modules_to_train = [
            model.a_linear, model.v_linear, model.av_encoder, model.av_enocder_layer, 
            model.a_modal_embs, model.v_modal_embs, model.decoder, model.a_pos_enc, model.v_pos_enc
        ]
    elif not model.cfg.use_video_modality and model.cfg.use_pretrained_dec:
        modules_to_train = [model.a_model.decoder]
    else:  # not model.cfg.use_video_modality and not model.cfg.use_pretrained_dec
        modules_to_train = [model.decoder]
    
    # Set the selected modules to training mode and enable gradients
    for module in modules_to_train:
        module.train()
        for param in module.parameters():
            param.requires_grad = True

    # Handle adapter configurations if needed
    if conf.adapters.linear_adapter.keep:
        model.a_model.freeze()
        model.a_model.set_enabled_adapters(enabled=False)
        model.a_model.set_enabled_adapters(name=conf.adapters.linear_adapter.name, enabled=True)
        model.a_model.unfreeze_enabled_adapters()
    else:
        model.a_model.unfreeze()

# Function to set up the trainer
def setup_trainer():
    torch.set_float32_matmul_precision('high')
    accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
    trainer = Trainer(
        devices=1, accelerator=accelerator, 
        # strategy="ddp_find_unused_parameters_true",
        # strategy="ddp_notebook",
        max_epochs=100,
        enable_checkpointing=False, logger=False,
        log_every_n_steps=5, check_val_every_n_epoch=1,
    )
    return trainer

# Function to set up experiment manager
def setup_exp_manager(trainer, model):
    os.environ.pop('NEMO_EXPM_VERSION', None)

    exp_config = exp_manager.ExpManagerConfig(
        exp_dir=model.cfg.exp_dir,
        name=f'{model.cfg.wandb.run_name}',
        checkpoint_callback_params=exp_manager.CallbackParams(
            monitor="val_u_wer",
            mode="min",
            always_save_nemo=True,
            save_best_model=True,
        ),
        create_wandb_logger=model.cfg.wandb.create_wandb_logger,
        wandb_logger_kwargs=OmegaConf.create({"project": f"{model.cfg.wandb.project}", "name": f"{model.cfg.wandb.run_name}_{model.cfg.train_ds.override_snr_ratio}", "log_model": model.cfg.wandb.log_model}),
    )

    exp_config = OmegaConf.structured(exp_config)
    logdir = exp_manager.exp_manager(trainer, exp_config)
    if model.cfg.wandb.create_wandb_logger:
        trainer.loggers[1].log_hyperparams(OmegaConf.to_container(model.cfg)) # wandb logger
        # log the manifest file to wandb server
        trainer.loggers[1].experiment.log_artifact(f"{model.cfg.train_ds.manifest_filepath}")
        trainer.loggers[1].experiment.log_artifact(f"{model.cfg.validation_ds.manifest_filepath}")
        
    return logdir


In [3]:
# Main function to execute the workflow
# def main(config_file_path, args):
# config_file_path = '/home/bld56/gsoc/nemo/NeMo-opensource/balu_codes/configs/c1.yaml'
# model, conf = load_and_configure_model(config_file_path)
# ckpt_path = f"/tmp/bld56_dataset_v1/saved_models/pre_av_ndec_uman_ntok--val_u_wer=0.0809-epoch=11.ckpt"
ckpt_path = f"/home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo"
model = nemo_asr.models.AV_EncDecCTCModelBPE.restore_from(ckpt_path, override_config_path=None) 
model.cfg.train_ds.manifest_filepath = '/tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json'
model.cfg.validation_ds.manifest_filepath = '/tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json'
model.cfg.test_ds.manifest_filepath = '/tmp/bld56_dataset_v1/it2/annotations/manifest_test_no_label.json'
print(model)
# model.cfg.wandb.run_name += 'pre+'
# manage_model_adapters(model, conf)

[NeMo I 2024-08-25 15:44:38 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2024-08-25 15:44:38 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    trim_

[NeMo I 2024-08-25 15:44:38 features:305] PADDING: 0


[NeMo I 2024-08-25 15:44:39 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.
EncDecCTCModelBPE(
  (preprocessor): AudioToMelSpectrogramPreprocessor(
    (featurizer): FilterbankFeatures()
  )
  (encoder): ConformerEncoder(
    (pre_encode): ConvSubsampling(
      (out): Linear(in_features=10240, out_features=512, bias=True)
      (conv): Sequential(
        (0): Conv2d(1, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (3): ReLU(inplace=True)
      )
    )
    (pos_enc): RelPositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-17): 18 x ConformerLayer(
        (norm_feed_forward1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
   

In [4]:
trainer = setup_trainer()
model.set_trainer(trainer)

[NeMo W 2024-08-25 15:44:39 nemo_logging:349] /home/bld56/.miniconda3/envs/nemo/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/bld56/.miniconda3/envs/nemo/lib/python3.10/sit ...
    
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [12]:
trainer.validate(model)

[NeMo W 2024-08-25 15:46:29 nemo_logging:349] /home/bld56/.miniconda3/envs/nemo/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/bld56/.miniconda3/envs/nemo/lib/python3.10/sit ...
    
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
[NeMo W 2024-08-25 15:46:29 nemo_logging:349] /home/bld56/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:105: Total length of `list` across ranks is zero. Please make sure this was your intention.
    


[]

## From Aug 16 Weekly meet to develop to transcribe fucniton

In [30]:
import os
import sys
sys.path.insert(0, os.path.abspath('/home/bld56/gsoc/nemo/NeMo-opensource/'))
import nemo.collections.asr as nemo_asr
import json
import nemo.collections.asr.data.av_to_text

In [31]:

# Function to load the model from a .nemo file
def load_model(nemo_file_path):
    model = nemo_asr.models.AV_EncDecCTCModelBPE.restore_from(nemo_file_path)
    model.eval()
    return model

# Function to perform inference on a single sample
def infer_single_sample(model, sample):
    # Prepare input data
    audio_file = sample['audio_filepath']
    video_file = sample['video_filepath']
    feature_file = sample['feature_file']
    duration = sample['duration']
    
    # Perform inference
    transcription = model.transcribe(
        audio=[audio_file],
        return_hypotheses = True,
        override_duration = duration,
    )
    
    return transcription[0]



In [32]:
import sentencepiece as spm

# Load the tokenizer model from the specified path
def load_tokenizer(tokenizer_model_path):
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(tokenizer_model_path)
    return tokenizer

# tokenizer_path = "/home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model"
# for i in range(self.tokenizer.vocab_size):
#         piece = self.tokenizer.ids_to_tokens([i])
#         piece = piece[0]
#         vocabulary[piece] = i + 1
# tokenizer = load_tokenizer(tokenizer_path)
config = {}

In [33]:
manifest_file_path = '/tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json'  # Path to your input manifest file
nemo_file_path = '/tmp/bld56_dataset_v1/tmp/av_ndec_lman_ntok_0.5/2024-08-16_11-16-34/checkpoints/av_ndec_lman_ntok_0.5.nemo'  # Path to your trained .nemo file
output_file_path = 'temp.json'  # Path to save the inference results
model = load_model(nemo_file_path)
model.to('cpu')

[NeMo I 2024-08-17 12:36:32 mixins:172] Tokenizer SentencePieceTokenizer initialized with 356 tokens


[NeMo W 2024-08-17 12:36:32 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train.json
    video_frame_rate: 5
    get_vid_feats: true
    get_zero_vid_feats: false
    sample_rate: 16000
    batch_size: 32
    shuffle: true
    num_workers: 11
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    override_snr_ratio: 0.7
    bucketing_batch_size:
    - 34
    - 30
    - 26
    - 22
    - 18
    - 16
    - 12
    - 8
    
[NeMo W 2024-08-17 12:36:32 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data()

[NeMo I 2024-08-17 12:36:32 cloud:58] Found existing object /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.
[NeMo I 2024-08-17 12:36:32 cloud:64] Re-using file from: /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo
[NeMo I 2024-08-17 12:36:32 common:815] Instantiating model from pre-trained checkpoint
Updated encoder _target_ model : nemo.collections.asr.modules.conformer_encoder.ConformerEncoderAdapter
[NeMo I 2024-08-17 12:36:32 cloud:58] Found existing object /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.
[NeMo I 2024-08-17 12:36:32 cloud:64] Re-using file from: /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo
[NeMo I 2024-08-17 12:36:3

[NeMo W 2024-08-17 12:36:33 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    trim_

[NeMo I 2024-08-17 12:36:33 features:305] PADDING: 0
[NeMo I 2024-08-17 12:36:34 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.
[NeMo I 2024-08-17 12:36:34 save_restore_connector:263] Model AV_EncDecCTCModelBPE was successfully restored from /tmp/bld56_dataset_v1/tmp/av_ndec_lman_ntok_0.5/2024-08-16_11-16-34/checkpoints/av_ndec_lman_ntok_0.5.nemo.


AV_EncDecCTCModelBPE(
  (a_model): EncDecCTCModelBPE(
    (preprocessor): AudioToMelSpectrogramPreprocessor(
      (featurizer): FilterbankFeatures()
    )
    (encoder): ConformerEncoderAdapter(
      (pre_encode): ConvSubsampling(
        (out): Linear(in_features=10240, out_features=512, bias=True)
        (conv): Sequential(
          (0): Conv2d(1, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (1): ReLU(inplace=True)
          (2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (3): ReLU(inplace=True)
        )
      )
      (pos_enc): RelPositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layers): ModuleList(
        (0-17): 18 x ConformerLayer(
          (norm_feed_forward1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (feed_forward1): ConformerFeedForward(
            (linear1): Linear(in_features=512, out_features=2048, bias=True)
            (activation): Swish()
           

In [34]:
dataset = nemo.collections.asr.data.av_to_text.AVToBPEDataset(
        manifest_filepath='/tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json',
        tokenizer= model.tokenizer,
        sample_rate= 16000,
        int_values=config.get('int_values', False),
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        trim=config.get('trim_silence', False),
        use_start_end_token=config.get('use_start_end_token', True),
        return_sample_id=config.get('return_sample_id', False),
        channel_selector=config.get('channel_selector', None),
        video_frame_rate=config.get('video_frame_rate', 5),
        get_vid_feats=config.get('get_vid_feats', True),
        get_zero_vid_feats = config.get('get_zero_vid_feats', False),
        override_snr_ratio = config.get('override_snr_ratio', None),
    )

[NeMo I 2024-08-17 12:36:34 collections:321] Dataset loaded with 2200 files totalling 6.11 hours
[NeMo I 2024-08-17 12:36:34 collections:323] 0 files were filtered totalling 0.00 hours


In [35]:
import torch
batch_size = 1
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.collate_fn)

In [41]:
signal, signal_len, video_input_signal, transcript, transcript_len = dataloader.__iter__().__next__()
log_probs, encoded_len, predictions = model.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
loss_value = model.loss(
            log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
        )
# print(transcript, predictions)
# tokenizer = load_tokenizer('/home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_toknizer/tokenizer.model')
# model.wer.decoding.decode_tokens_to_str(predictions[0].cpu().numpy().tolist())
# replace predictions[0] where 356 to 355
predictions[0][predictions[0] == 356] = 355
print(model.wer.decoding.decode_tokens_to_str(transcript[0].cpu().numpy().tolist()))
print('\n')
print(model.wer.decoding.decode_tokens_to_str(predictions[0].cpu().numpy().tolist()))

so regular and complete a part of normal everyday living that finding newspapers on the news then buying them morning and night was takenso regular and complete a part of normal everyday living that finding newspapers on the news then buying them morning and night was taken <N11>


<N228>+<N228>+<N228>+<N228>+<N228>+<N228>+ re reguular and and com compleletee a p parart of of n normmalal eververyy dayay l liivving that that f findding n nwssppaapperss on on the neewssstandnds b buyying the them m mororninging and and n niightt<N228>+ wasas t taakinging<N228>+<N228>+<N228>+rereggullar and and compleletee a a parart of of norormmal e eververy d dayay l liivving that that f findinging n nwssppaapperss on the the neewssstandnds b buyyinging themm m mornning and and<N228>+ n nighght<N228>+ wasas t t takk<N228>+ing<N228>+ <N36>ararararllararararararararlararararararararararararararararararararararlararararararararlararar


In [39]:
import re
temp_str = model.wer.decoding.decode_tokens_to_str(predictions[0].cpu().numpy().tolist())
r_tags = re.findall(r'<N\d+>', temp_str)
for tag in r_tags:
    unlabelled_h = temp_str.replace(tag, '')
print(unlabelled_h)

<N228>+<N228>+<N228>+<N228>+<N228>+<N228>+ re reguular and and com compleletee a p parart of of n normmalal eververyy dayay l liivving that that f findding n nwssppaapperss on on the neewssstandnds b buyying the them m mororninging and and n niightt<N228>+ wasas t taakinging<N228>+<N228>+<N228>+rereggullar and and compleletee a a parart of of norormmal e eververy d dayay l liivving that that f findinging n nwssppaapperss on the the neewssstandnds b buyyinging themm m mornning and and<N228>+ n nighght<N228>+ wasas t t takk<N228>+ing<N228>+ ararararllararararararararlararararararararararararararararararararararlararararararararlararar


In [None]:

# Function to run inference on a manifest file
def run_inference(manifest_file_path, nemo_file_path, output_file_path):
    # Load the model
    model = load_model(nemo_file_path)
    
    # Read the manifest file
    with open(manifest_file_path, 'r') as f:
        manifest_data = [json.loads(line.strip()) for line in f]
    
    # Run inference on each sample in the manifest
    results = []
    for sample in manifest_data:
        transcription = infer_single_sample(model, sample)
        result = {
            'audio_filepath': sample['audio_filepath'],
            'video_filepath': sample['video_filepath'],
            'feature_file': sample['feature_file'],
            'duration': sample['duration'],
            'transcription': transcription
        }
        results.append(result)
    
    # Save the results to the output file
    with open(output_file_path, 'w') as f:
        for result in results:
            f.write(json.dumps(result) + '\n')

    print(f"Inference completed. Results saved to {output_file_path}")

# Main function
def main():
    manifest_file_path = '/tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json'  # Path to your input manifest file
    nemo_file_path = '/tmp/bld56_dataset_v1/tmp/av_ndec_lman_ntok_0.5/2024-08-16_11-16-34/checkpoints/av_ndec_lman_ntok_0.5.nemo'  # Path to your trained .nemo file
    output_file_path = 'temp.json'  # Path to save the inference results
    
    run_inference(manifest_file_path, nemo_file_path, output_file_path)

if __name__ == "__main__":
    main()
