In [None]:
import torch
import os
from omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer

import nemo.collections.asr as nemo_asr
from pathlib import Path


In [None]:
# Read only the configuration stored in the local .nemo checkpoint
# (restore_from is called with return_config=True, so `cfg` holds the
# config rather than an instantiated model).

MODEL_ROOT = Path("/external2/models/hf/stt_en_conformer_ctc_large/")
model_path = MODEL_ROOT / "stt_en_conformer_ctc_large.nemo"

cfg = nemo_asr.models.ASRModel.restore_from(
    restore_path=model_path,
    return_config=True,
)

# Dump the restored configuration for inspection.

print(cfg)

In [None]:
# from nemo.core import adapter_mixins

# # Utility method to check and update the model config
# def update_model_config_to_support_adapter(model_cfg):
#     with open_dict(model_cfg):
#         adapter_metadata = adapter_mixins.get_registered_adapter(model_cfg.encoder._target_)
#         if adapter_metadata is not None:
#             model_cfg.encoder._target_ = adapter_metadata.adapter_class_path

#     print("Updated encoder _target_ model :", model_cfg.encoder._target_)
#     return model_cfg

In [None]:
#cfg = update_model_config_to_support_adapter(cfg)

In [None]:
# Instantiate the ASR model from the same checkpoint, passing the previously
# loaded `cfg` as the configuration override.
model = nemo_asr.models.ASRModel.restore_from(
    model_path,
    override_config_path=cfg,
)

In [None]:
# Locations of the training / validation manifests and of the tag inventory.
data_dir = "/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/"
TRAIN_MANIFEST = os.path.join(data_dir, "train.json")
TEST_MANIFEST = os.path.join(data_dir, "valid.json")
ALL_TAGS = os.path.join(data_dir, "alltags_uniq.txt")

# Every non-blank line of ALL_TAGS must hold exactly two whitespace-separated
# fields; the second field is collected as the tag.
taglist = []
with open(ALL_TAGS, 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line (e.g. an empty last line) carries no tag; unpacking
            # it into two names would raise ValueError.
            continue
        _word, tag = fields
        taglist.append(tag)

In [None]:
import os
import sentencepiece as spm
import json
import logging

def train_sentencepiece_tokenizer(manifest_file, tokenizer_folder, special_tokens=None, vocab_size=5000):
    """Train a SentencePiece tokenizer on the transcripts listed in a manifest.

    Args:
        manifest_file: Path to a JSON-lines manifest. Every non-blank line must
            be a JSON object with a ``text`` key; blank lines are ignored.
        tokenizer_folder: Directory that receives every output file. It is
            created when it does not exist yet.
        special_tokens: Optional sequence of strings. When non-empty it is
            comma-joined and handed to the trainer as ``user_defined_symbols``.
        vocab_size: Vocabulary size forwarded to the SentencePiece trainer.

    Returns:
        A tuple ``(model_file, vocab_file, vocab_txt_file)`` with the paths of
        ``tokenizer.model``, ``tokenizer.vocab`` and ``vocab.txt`` inside
        ``tokenizer_folder``.

    Raises:
        json.JSONDecodeError / KeyError: if a non-blank manifest line is not
            valid JSON or has no ``text`` key.
    """
    # basicConfig is a no-op once the root logger already has handlers.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    logging.info("Starting the tokenizer training process")

    # Step 1: Read the manifest file and extract text data
    def read_manifest(manifest_path):
        """Return the ``text`` value of every non-blank manifest line."""
        # UTF-8 is requested explicitly so non-ASCII transcripts do not depend
        # on the platform's default encoding.
        with open(manifest_path, 'r', encoding='utf-8') as f:
            return [json.loads(line)['text'] for line in f if line.strip()]

    logging.info("Reading manifest file")
    text_data = read_manifest(manifest_file)
    logging.info(f"Extracted {len(text_data)} sentences from the manifest file")

    # Step 2: Save the extracted text to a temporary file
    os.makedirs(tokenizer_folder, exist_ok=True)

    temp_text_file = os.path.join(tokenizer_folder, 'text_data.txt')
    logging.info(f"Saving extracted text to {temp_text_file}")
    with open(temp_text_file, 'w', encoding='utf-8') as f:
        f.writelines(sentence + '\n' for sentence in text_data)

    # Step 3: Train the SentencePiece tokenizer with special tokens if provided
    model_prefix = os.path.join(tokenizer_folder, 'tokenizer')
    train_kwargs = {
        'input': temp_text_file,
        'model_prefix': model_prefix,
        'vocab_size': vocab_size,
    }

    if special_tokens:
        train_kwargs['user_defined_symbols'] = ','.join(special_tokens)
        logging.info(f"Special tokens provided: {special_tokens}")
        logging.info("Starting SentencePiece training with special tokens")
    else:
        logging.info("Starting SentencePiece training without special tokens")
    spm.SentencePieceTrainer.train(**train_kwargs)

    # Step 4: Paths of the files the trainer wrote next to model_prefix
    model_file = f"{model_prefix}.model"
    vocab_file = f"{model_prefix}.vocab"

    logging.info("Tokenizer training completed")
    logging.info(f"Model file: {model_file}")
    logging.info(f"Vocab file: {vocab_file}")

    # Step 5: Create a vocab.txt file holding only the first tab-separated
    # column (the piece) of every .vocab line
    vocab_txt_file = os.path.join(tokenizer_folder, 'vocab.txt')
    logging.info(f"Creating vocab.txt file at {vocab_txt_file}")
    with open(vocab_file, 'r', encoding='utf-8') as vf, \
            open(vocab_txt_file, 'w', encoding='utf-8') as vtf:
        for line in vf:
            # Strip the newline first so a line without a tab does not end up
            # followed by two newlines.
            token = line.rstrip('\n').split('\t')[0]
            vtf.write(token + '\n')

    logging.info(f"vocab.txt file created at {vocab_txt_file}")

    return model_file, vocab_file, vocab_txt_file

In [None]:
# Rebuild `taglist` from ALL_TAGS: keep the second whitespace-separated field
# of every non-blank line. The context manager closes the file handle, and
# blank lines are skipped instead of raising IndexError.
with open(ALL_TAGS, 'r', encoding='utf-8') as tag_file:
    taglist = [line.split()[1] for line in tag_file.read().splitlines() if line.strip()]


In [None]:
# Train a tokenizer with a 1600-entry vocabulary on the training manifest,
# passing every tag in `taglist` as a special token.
tokenizer_out_dir = "/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer"
train_sentencepiece_tokenizer(
    TRAIN_MANIFEST,
    tokenizer_out_dir,
    special_tokens=taglist,
    vocab_size=1600,
)

In [None]:
import subprocess
import os
import sentencepiece as spm
import logging
import sys
import json

def generate_sentencepiece_model_pb2(script_dir, proto_file_path):
    """Run ``protoc`` to generate Python bindings for a ``.proto`` file.

    Args:
        script_dir: Directory passed to ``protoc`` as ``--python_out``.
        proto_file_path: Path of the ``.proto`` file to compile.

    Returns:
        None. A success message is printed when ``protoc`` exits with status 0.

    Raises:
        SystemExit: with code 1 when ``protoc`` exits with a non-zero status or
            cannot be launched at all (e.g. the executable is not on PATH).
    """
    command = ['protoc', f'--python_out={script_dir}', proto_file_path]

    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers FileNotFoundError / PermissionError raised when the
        # protoc executable itself cannot be started; without it those errors
        # would bypass the error report below.
        print(f"Error generating sentencepiece_model_pb2.py: {e}")
        sys.exit(1)

    print("Successfully generated sentencepiece_model_pb2.py")

def edit_spt_model(input_file, output_folder, tokens, vocab_file, vocab_txt_file, is_userdefined=False):
    """Append extra pieces to a SentencePiece model and mirror them in the vocab files.

    Args:
        input_file: Path of the existing serialized SentencePiece model.
        output_folder: Directory that receives ``tokenizer.model``,
            ``tokenizer.vocab`` and ``vocab.txt`` (created if missing).
        tokens: Iterable of piece strings to add. Pieces already present in the
            model, or repeated inside ``tokens``, are skipped with a warning.
        vocab_file: Path of the existing ``.vocab`` file to copy and extend.
        vocab_txt_file: Path of the existing ``vocab.txt`` file to copy and extend.
        is_userdefined: When true the new pieces get type 4, otherwise type 3
            (USER_DEFINED and CONTROL in sentencepiece_model.proto).

    Returns:
        None.

    Raises:
        SystemExit: with code 1 when the edited model cannot be loaded by a
            ``SentencePieceProcessor``.
    """
    # Imported lazily: the module is produced by generate_sentencepiece_model_pb2,
    # so it may not exist when this function is defined.
    from sentencepiece_model_pb2 import ModelProto

    os.makedirs(output_folder, exist_ok=True)

    output_model_file = os.path.join(output_folder, 'tokenizer.model')
    output_vocab_file = os.path.join(output_folder, 'tokenizer.vocab')
    output_vocab_txt_file = os.path.join(output_folder, 'vocab.txt')

    token_type = 4 if is_userdefined else 3

    model = ModelProto()
    with open(input_file, 'rb') as model_fh:
        model.ParseFromString(model_fh.read())

    known_pieces = {piece.piece for piece in model.pieces}

    new_tokens = []
    for token in tokens:
        if token in known_pieces:
            logging.warning(f"Special Token '{token}' already exists in the input model, skipping.")
            continue
        model.pieces.append(model.SentencePiece(piece=token, score=0.0, type=token_type))
        new_tokens.append(token)
        # Track the piece so a repeat later in `tokens` is not appended twice.
        known_pieces.add(token)

    serialized_model = model.SerializeToString()

    # Sanity check: the edited proto must load, and every new piece must resolve.
    sp = spm.SentencePieceProcessor()
    try:
        sp.LoadFromSerializedProto(serialized_model)
        for token in new_tokens:
            token_id = sp.piece_to_id(token)
            logging.info(f"Created token '{token}' at ID {token_id}")
        logging.info(f"New tokenizer vocab size: {sp.get_piece_size()}")
    except Exception:
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate;
        # logging.exception keeps the traceback of the underlying failure.
        logging.exception("Could not appropriately configure new tokenizer. Verify if the special tokens already exist.")
        sys.exit(1)

    with open(output_model_file, 'wb') as outf:
        outf.write(serialized_model)

    logging.info(f"Created new tokenizer at: {output_model_file}")

    def _copy_and_extend(source_path, target_path, suffix):
        """Copy source_path to target_path, then append one line per new token."""
        with open(source_path, 'r', encoding='utf-8') as source_fh:
            original_lines = source_fh.readlines()
        with open(target_path, 'w', encoding='utf-8') as target_fh:
            target_fh.writelines(original_lines)
            if original_lines and not original_lines[-1].endswith('\n'):
                # Keep the first new token from being glued onto the last old line.
                target_fh.write('\n')
            for token in new_tokens:
                target_fh.write(f"{token}{suffix}\n")

    # .vocab rows are "<piece>\t<score>"; the new pieces were added with score 0.0,
    # so they get a matching score column.
    _copy_and_extend(vocab_file, output_vocab_file, "\t0")
    # vocab.txt holds one bare piece per line.
    _copy_and_extend(vocab_txt_file, output_vocab_txt_file, "")

    logging.info(f"Updated vocab files: {output_vocab_file}, {output_vocab_txt_file}")

def update_model_config(model, new_model_path):
    """Point the tokenizer entry of a model config at a new tokenizer model file.

    Args:
        model: Nested mapping; ``model['cfg']['tokenizer']`` must already exist.
        new_model_path: Value stored under the ``model_path`` key.

    Returns:
        None. ``model`` is modified in place and the change is logged.
    """
    tokenizer_section = model['cfg']['tokenizer']
    tokenizer_section['model_path'] = new_model_path

    message = f"Updated model configuration with new tokenizer model path: {new_model_path}"
    logging.info(message)


# # Define input and output paths
# input_folder = MODEL_ROOT / "tokenizer"
# output_folder = MODEL_ROOT / "new_tokenizer"


# #input_folder = '/external/ksingla/models/nemo/stt_en_conformer_ctc_small/tokenizer'
# #output_folder = '/external/ksingla/models/nemo/stt_en_conformer_ctc_small/new_tokenizer'
# #proto_dir = '/path/to/save/proto'  # Define the actual path where the proto file should be saved
# #proto_file = '/path/to/sentencepiece_model.proto'  # Define the actual path to the sentencepiece_model.proto file

# input_file = input_folder / 'tokenizer.model'
# vocab_file = input_folder / 'tokenizer.vocab'
# vocab_txt_file = input_folder / 'vocab.txt'

# # input_file = os.path.join(input_folder, 'tokenizer.model')
# # vocab_file = os.path.join(input_folder, 'tokenizer.vocab')
# # vocab_txt_file = os.path.join(input_folder, 'vocab.txt')

# # Include all single-digit integers in the tokens list
# punctuations = ['.', ',', '?', '!', ';', ':', '-', '(', ')', '[', ']', '{', '}', '<', '>', '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '+', '=', '~', '`', '_', '"', "'"]
# tokens = taglist + [str(i) for i in range(10)] + punctuations
# is_userdefined = True

# # Step 1: Generate the sentencepiece_model_pb2.py file
# #generate_sentencepiece_model_pb2(proto_dir, proto_file)

# # Step 2: Edit the SentencePiece model
# edit_spt_model(input_file, output_folder, tokens, vocab_file, vocab_txt_file, is_userdefined)

# Step 3: Load the model configuration and update it
# model_config_file = '/path/to/model/config.json'  # Define the actual path to the model config file
# with open(model_config_file, 'r') as f:
#     model = json.load(f)

# new_model_path = os.path.join(output_folder, 'tokenizer.model')
# update_model_config(model, new_model_path)

# # Save the updated model configuration
# with open(model_config_file, 'w') as f:
#     json.dump(model, f, indent=4)

#logging.info(f"Updated model configuration saved to: {model_config_file}")


In [None]:
# Replace the model's vocabulary using the tokenizer directory below,
# declared as a "bpe" tokenizer.
new_tokenizer_dir = "/home/ksingla/workspace/PromptingNemo/data_v2/synthetic/processed/tokenizer/"
model.change_vocabulary(new_tokenizer_dir, "bpe")

In [None]:
# Use a GPU when CUDA reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    accelerator = 'gpu'
else:
    accelerator = 'cpu'

# Upper bound on optimizer steps for the run.
max_steps = 600000

# Checkpointing and logging are disabled on the Trainer itself.
trainer_options = dict(
    devices=1,
    accelerator=accelerator,
    max_steps=max_steps,
    enable_checkpointing=False,
    logger=False,
    log_every_n_steps=50,
    check_val_every_n_epoch=1,
    accumulate_grad_batches=8,
)
trainer = Trainer(**trainer_options)

# Attach the trainer to the model.
model.set_trainer(trainer)

In [None]:
# utility method
import json
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest

In [None]:
# Overrides for the training dataloader section of the config.
train_ds_overrides = {
    "manifest_filepath": TRAIN_MANIFEST,
    "batch_size": 12,
    "is_tarred": False,
    "tarred_audio_filepaths": None,
    "num_workers": 8,
}

# Overrides for the validation dataloader section of the config.
validation_ds_overrides = {
    "manifest_filepath": TEST_MANIFEST,
    "batch_size": 12,
    "num_workers": 8,
}

# open_dict lets keys that are not yet in the config be assigned.
with open_dict(model.cfg):
    for option, setting in train_ds_overrides.items():
        setattr(model.cfg.train_ds, option, setting)
    for option, setting in validation_ds_overrides.items():
        setattr(model.cfg.validation_ds, option, setting)

# Build the dataloaders. The test loader is set up from the validation config.
model.setup_training_data(model.cfg.train_ds)
model.setup_multiple_validation_data(model.cfg.validation_ds)
model.setup_multiple_test_data(model.cfg.validation_ds)

In [None]:
# Spec Augment: each field is written back with its current value, so this is
# a placeholder; replace the right-hand side to change a setting.
with open_dict(model.cfg):
    spec_augment_cfg = model.cfg.spec_augment
    for field_name in ("freq_masks", "freq_width", "time_masks", "time_width"):
        current_value = getattr(spec_augment_cfg, field_name)
        setattr(spec_augment_cfg, field_name, current_value)  # Can be changed

# Rebuild the augmentation module from the (possibly edited) config section.
model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)

In [None]:
# Show the optimizer section of the config as YAML, when the config has one.
if 'optim' in model.cfg:
    optim_yaml = OmegaConf.to_yaml(model.cfg.optim)
    print(optim_yaml)

In [None]:
# Override learning rate, weight decay and scheduler warmup in the config.
with open_dict(model.cfg):
    optim_cfg = model.cfg.optim
    optim_cfg.lr = 0.1
    optim_cfg.weight_decay = 0.0001
    optim_cfg.sched.warmup_steps = 1000

# Rebuild the optimizer from the edited section; the trailing semicolon
# suppresses the cell's output.
model.setup_optimization(model.cfg.optim);

In [None]:
# if hasattr(model, 'adapter_module_names'):
#   print(model.adapter_module_names)

In [None]:
# for module in model.children():
#   if hasattr(module, 'get_accepted_adapter_types'):
#     types = module.get_accepted_adapter_types()
#     print("Module : ", module.__class__.__name__)

#     for tp in types:
#       print(tp)
#     print()

In [None]:
# from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig

In [None]:

# adapter_name = "AN4" #@param {type:"string"}
# adapter_dim = 32 #@param {type:"integer"}
# adapter_activation = "swish" #@param {type:"string"}
# adapter_norm_position = "pre" #@param ["pre", "post"]

In [None]:
# adapter_cfg = LinearAdapterConfig(
#     in_features=model.cfg.encoder.d_model,  # conformer specific model dim. Every layer emits this dim at its output.
#     dim=adapter_dim,  # the bottleneck dimension of the adapter
#     activation=adapter_activation,  # activation used in bottleneck block
#     norm_position=adapter_norm_position,  # whether to use LayerNorm at the beginning or the end of the adapter
# )
# print(adapter_cfg)

In [None]:
# Model summary before the (currently commented-out) adapter cells below.
model.summarize()

In [None]:
# model.add_adapter(name=adapter_name, cfg=adapter_cfg)

In [None]:
# Model summary again; the add_adapter call in the preceding cell is commented
# out, so no adapter was added since the previous summary.
model.summarize()

In [None]:
# model.set_enabled_adapters(enabled=False)  # disable all adapters
# model.set_enabled_adapters(name=adapter_name, enabled=True)  # enable only the current adapter we want to train

In [None]:
# model.freeze()
# model.unfreeze_enabled_adapters()
# #model.unfreeze()
# model.decoder.unfreeze()

In [None]:
# Model summary again; the freeze/unfreeze calls in the preceding cell are
# commented out.
model.summarize()

In [None]:
# Let NeMo's experiment manager take care of checkpoint saving and logging.
from nemo.utils import exp_manager

# NEMO_EXPM_VERSION is generally used for multi-node multi-gpu training.
# In a notebook it is unnecessary and can make the logs of separate training
# runs overwrite each other, so drop it if it is set.
os.environ.pop('NEMO_EXPM_VERSION', None)

# Checkpoint callback settings: monitor "val_wer" in "min" mode.
checkpoint_params = exp_manager.CallbackParams(
    monitor="val_wer",
    mode="min",
    always_save_nemo=True,
    save_best_model=True,
)

exp_config = OmegaConf.structured(
    exp_manager.ExpManagerConfig(
        exp_dir='/external2/karan_exp/experiments/',
        name="finetune-multidomain-alllang-100k-ctc",
        checkpoint_callback_params=checkpoint_params,
    )
)

logdir = exp_manager.exp_manager(trainer, exp_config)

In [None]:
# Display the tokenizer section of the model config (last expression of the cell).
model.cfg['tokenizer']



In [None]:
# Final model summary before training starts.
model.summarize()

In [None]:
# Finally, run training.
# NOTE(review): every adapter cell in this notebook (add_adapter, freeze,
# unfreeze_enabled_adapters) is commented out, so this presumably fine-tunes
# the model itself rather than an adapter -- confirm this is intended.
trainer.fit(model)