# Libraries Installation & Imports 📦🔌

## Libraries Installations 🛠️
### Installing the system packages and Python libraries the notebook depends on.

In [None]:
!pip install wget
!apt-get -y install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2
!pip install aiohttp==3.9.2
!pip install boto3 --upgrade

In [None]:
# NeMo git ref to install; later cells reuse BRANCH to download the matching
# tokenizer script and model config from the same ref.
BRANCH = 'r2.0.0rc0'
# `$BRANCH` is expanded from the Python variable above by the `!` shell magic.
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH

## Importing Libraries 📚
### Importing the installed libraries into the notebook to use their functionalities.

In [None]:
import os
import nemo
import json
import wandb
import librosa
import pandas as pd
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf, open_dict

# Hyper-Parameter Tuning 🎛️⚙️

## Tuning Tokenizer 🔧📝
### Adjusting the tokenizer settings to optimize its performance for the given task.

In [None]:
if not os.path.exists("scripts/tokenizers/process_asr_text_tokenizer.py"):
    !mkdir scripts
    !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py

In [None]:
# Run the downloaded tokenizer script on the training manifest with a "spe" tokenizer
# of type "unigram", a vocabulary size of 128 and lower-casing disabled; the output
# location is given by --data_root.
# NOTE(review): "tokinzers" is misspelled, but the same directory name is copied in the
# "Loading Folders" cell — if it is renamed, rename it in both places.
!python ./scripts/process_asr_text_tokenizer.py \
  --manifest='/kaggle/input/aic-manifests/train_manifest.json' \
  --data_root="/kaggle/working/tokinzers/sus" \
  --vocab_size=128 \
  --tokenizer="spe" \
  --no_lower_case \
  --spe_type="unigram" \
  --log

In [None]:
!mkdir configs
BRANCH = 'r2.0.0rc0'
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/conformer/conformer_ctc_bpe.yaml

In [None]:
# !git clone https://github.com/NVIDIA/NeMo.git

## Loading Folders to Working 📂➡️🏗️
### Setting up the directory structure and loading necessary folders for the project.

In [None]:
# Copy the NeMo checkout, configs, results, scripts and tokenizer folders from the
# attached input dataset into /kaggle/working.
# Files with the same names produced by earlier cells (for example the downloaded
# config or tokenizer script) are overwritten by these copies.
!cp -r /kaggle/input/sussy-baka/NeMo "/kaggle/working/"
!cp -r /kaggle/input/sussy-baka/configs "/kaggle/working/"
!cp -r /kaggle/input/sussy-baka/results "/kaggle/working/"
!cp -r /kaggle/input/sussy-baka/scripts "/kaggle/working/"
!cp -r /kaggle/input/sussy-baka/tokinzers "/kaggle/working/"

## Tuning conformer_ctc_bpe 🔧🧩
### Overriding precision, dropout, and batch size in the Conformer-CTC config (BPE, Byte-Pair Encoding, tokenizer variant) before training.

In [None]:
# Load the Conformer-CTC BPE config from the working directory as an OmegaConf object.
params = OmegaConf.load("/kaggle/working/configs/conformer_ctc_bpe.yaml")

In [None]:
# Configuration overrides applied on top of the loaded config.
params.trainer.precision=32 # trainer precision; 32 or 16 are the values considered here
# Encoder dropout rates (general, pre-encoder and embedding).
params.model.encoder.dropout = 0.2
params.model.encoder.dropout_pre_encoder = 0.2
params.model.encoder.dropout_emb = 0.1
# Training batch size.
params.model.train_ds.batch_size=4

In [None]:
# Write the modified config over the example config inside the NeMo checkout
# (a different location from the file it was loaded from).
OmegaConf.save(params, "/kaggle/working/NeMo/examples/asr/conf/conformer/conformer_ctc_bpe.yaml")

## Logging in to Weights & Biases 🌐📊
### Authenticating with Weights & Biases (W&B, the `wandb` library) for experiment tracking and visualization.

In [None]:
# Never commit API keys to a notebook. Provide the key through the WANDB_API_KEY
# environment variable (for example, populated from a Kaggle secret) before running
# this cell. The key that was previously hard-coded here must be treated as leaked
# and revoked in the W&B account settings.
WANDB_API = os.environ.get("WANDB_API_KEY")
# With key=None, wandb falls back to its own credential lookup / interactive prompt.
wandb.login(key=WANDB_API)

## Adding FAdam ➕⚡
### Integrating the FAdam optimizer to improve the training efficiency of the model.

In [None]:
# Replacement source for nemo/lightning/pytorch/optim/base.py in the working NeMo checkout.
# The payload defines LRSchedulerModule and OptimizerModule, plus a MyOptimizerModule
# subclass whose optimizers() builds a single `fAdam` optimizer over model.parameters().
# NOTE(review): the payload neither imports nor defines `fAdam`; unless that name is
# injected into the module some other way, MyOptimizerModule.optimizers() raises
# NameError when called — confirm.
write = '''import types
from abc import ABC, abstractmethod
from typing import List, Optional

import pytorch_lightning as L
from pytorch_lightning.utilities.types import OptimizerLRScheduler
from torch.optim import Optimizer

from nemo.lightning.megatron_parallel import CallbackMethods


class LRSchedulerModule(L.Callback, CallbackMethods, ABC):
    """A module to standardize the learning rate scheduler setup and configuration.

    This class decouples the learning rate scheduler from the model, similar to how the LightningDataModule
    decouples data handling. It also acts as a Callback to hook into the training loop, which can be useful
    for adding custom all-reduces, logging, early stopping, etc. Next to that standard Lightning callback-event,
    this also supports hooking into the Megatron forward-backward function at a granular level.

    Example::

        class MyLRSchedulerModule(LRSchedulerModule):
            def setup(self, model, optimizer):
                # Custom setup logic
                ...

            def scheduler(self, model, optimizers):
                # Define and return the learning rate scheduler
                ...

    Methods:
        setup(model, optimizer): Sets up the learning rate scheduler.
        scheduler(model, optimizers): Abstract method to define the learning rate scheduler.
        __call__(model, optimizers): Calls the setup and scheduler methods.
    """

    def connect(self, model, optimizer) -> None:
        """Sets up the learning rate scheduler.

        Args:
            model: The model for which the scheduler is being set up.
            optimizer: The optimizer for which the scheduler is being set up.
        """
        ...

    @abstractmethod
    def scheduler(self, model, optimizers) -> OptimizerLRScheduler:
        """Abstract method to define the learning rate scheduler.

        Args:
            model: The model for which the scheduler is being defined.
            optimizers: The optimizers for which the scheduler is being defined.

        Returns:
            OptimizerLRScheduler: The learning rate scheduler.
        """
        raise NotImplementedError("The scheduler method should be implemented by subclasses.")

    def __call__(self, model, optimizers):
        """Calls the setup and scheduler methods.

        Args:
            model: The model for which the scheduler is being called.
            optimizers: The optimizers for which the scheduler is being called.

        Returns:
            OptimizerLRScheduler: The learning rate scheduler.
        """

        self.connect(model, optimizers)

        self._scheduler = self.scheduler(model, optimizers)

        if not isinstance(self._scheduler, (dict, tuple)):
            return optimizers, self._scheduler

        return self._scheduler


class OptimizerModule(L.Callback, CallbackMethods, ABC):
    """A module to standardize the optimizer setup and configuration.

    This class decouples the optimizer from the model, similar to how the LightningDataModule
    decouples data handling. It also acts as a Callback to hook into the training loop, which can be useful
    for adding custom all-reduces, logging, early stopping, etc. Next to that standard Lightning callback-event,
    this also supports hooking into the Megatron forward-backward function at a granular level.

    Attributes:
        lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module.

    Example::

        class MyOptimizerModule(OptimizerModule):
            def __init__(self, lr_scheduler=None):
                super().__init__(lr_scheduler)

            def setup(self, model):
                # Custom setup logic
                ...

            def optimizers(self, model):
                # Define and return the optimizers
                ...

    Methods:
        connect(model, trainer): Connects the optimizer module to the model and trainer.
        setup(model): Sets up the optimizer.
        optimizers(model): Abstract method to define the optimizers.
        __call__(model, megatron_parallel): Calls the setup and optimizers methods.
    """

    def __init__(self, lr_scheduler: Optional[LRSchedulerModule]):
        """Initializes the OptimizerModule.

        Args:
            lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module.
        """
        self.lr_scheduler = lr_scheduler

    def connect(self, model: L.LightningModule) -> None:
        """Connects the optimizer module to the model and trainer.

        Args:
            model (L.LightningModule): The model to which the optimizer module is being connected.
        """

        def custom_configure_optimizers(lightning_module_self, megatron_parallel=None):
            opt = self(lightning_module_self, megatron_parallel=megatron_parallel)
            return opt

        model.configure_optimizers = types.MethodType(custom_configure_optimizers, model)
        model.optim = self

        if hasattr(self, "__io__") and hasattr(model, "__io__"):
            if hasattr(model.__io__, "optim"):
                model.__io__.optim = self.__io__

    @abstractmethod
    def optimizers(self, model) -> List[Optimizer]:
        """Abstract method to define the optimizers.

        Args:
            model: The model for which the optimizers are being defined.

        Returns:
            List[Optimizer]: The list of optimizers.
        """
        raise NotImplementedError("The optimizers method should be implemented by subclasses.")

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
        if self._optimizers is not None:
            lr = self._optimizers[0].param_groups[0]['lr']
            pl_module.log('lr', lr, rank_zero_only=True, batch_size=1)

    def __call__(self, model: L.LightningModule, megatron_parallel=None) -> OptimizerLRScheduler:
        """Calls the setup and optimizers methods.

        Args:
            model (L.LightningModule): The model for which the optimizers are being called.
            megatron_parallel: Optional parallel model.

        Returns:
            OptimizerLRScheduler: The optimizers and optionally the learning rate scheduler.
        """
        _model = model if megatron_parallel is None else megatron_parallel
        callbacks = _model.trainer.callbacks
        if self not in callbacks:
            callbacks.append(self)
        if self.lr_scheduler is not None and self.lr_scheduler not in callbacks:
            callbacks.append(self.lr_scheduler)

        self._optimizers = self.optimizers(_model)

        _opt = self._optimizers[0] if len(self._optimizers) == 1 else self._optimizers

        if self.lr_scheduler is not None:
            with_scheduler = self.lr_scheduler(_model, _opt)

            return with_scheduler

        return self._optimizers


class MyOptimizerModule(OptimizerModule):
    def optimizers(self, model) -> List[Optimizer]:
        optimizer = fAdam(model.parameters(), lr=5.0, weight_decay=5e-4, betas=(0.9, 0.98))
        print("From fAdam")
        return [optimizer]
        '''

# Overwrite base.py in the NeMo checkout with the payload above.
with open("/kaggle/working/NeMo/nemo/lightning/pytorch/optim/base.py", "w") as file:
    file.write(write)

In [None]:
# Replacement source for nemo/lightning/pytorch/optim/megatron.py in the working NeMo checkout.
# In this payload MegatronOptimizerModule subclasses MyOptimizerModule (imported from the
# base module) and overrides optimizers() to build its optimizer via get_megatron_optimizer.
write = '''from typing import Any, Callable, List, Mapping, Optional

import pytorch_lightning as pl
from megatron.core.distributed import finalize_model_grads
from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
from megatron.core.utils import get_model_config
from torch.optim import Optimizer

from nemo.lightning.megatron_parallel import MegatronParallel
from nemo.lightning.pytorch.optim.base import LRSchedulerModule, MyOptimizerModule


class MegatronOptimizerModule(MyOptimizerModule):
    """A MyOptimizerModule for the megatron optimizers.

    Attributes:
        config (OptimizerConfig): Configuration for the optimizer.
        no_weight_decay_cond (Optional[Callable]): Condition for no weight decay.
        scale_lr_cond (Optional[Callable]): Condition for scaling learning rate.
        lr_mult (float): Learning rate multiplier.

    Example::

        config = OptimizerConfig(...)
        lr_scheduler = MyLRSchedulerModule(...)
        optimizer_module = MegatronOptimizerModule(config, lr_scheduler)

    Methods:
        setup(model): Sets up the optimizer.
        optimizers(model): Defines the optimizers.
    """

    def __init__(
        self,
        config: OptimizerConfig,
        lr_scheduler: Optional[LRSchedulerModule] = None,
        no_weight_decay_cond: Optional[Callable] = None,
        scale_lr_cond: Optional[Callable] = None,
        lr_mult: float = 1.0,
    ):
        """Initializes the MegatronOptimizerModule.

        Args:
            config (OptimizerConfig): Configuration for the optimizer.
            lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module.
            no_weight_decay_cond (Optional[Callable]): Condition for no weight decay.
            scale_lr_cond (Optional[Callable]): Condition for scaling learning rate.
            lr_mult (float): Learning rate multiplier.
        """

        super().__init__(lr_scheduler=lr_scheduler)
        self.config = config
        self.no_weight_decay_cond = no_weight_decay_cond
        self.scale_lr_cond = scale_lr_cond
        self.lr_mult = lr_mult

    def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str):
        """We will add the finalize_model_grads function to the model config.

        Args:
            model: The model for which the optimizer is being set up.
        """

        def finalize_model_grads_func(*args, **kwargs):
            return self.finalize_model_grads(*args, **kwargs)

        get_model_config(pl_module).finalize_model_grads_func = finalize_model_grads_func

    def optimizers(self, model: MegatronParallel) -> List[Optimizer]:
        """Defines the optimizers.

        Args:
            model (MegatronParallel): The model for which the optimizers are being defined.

        Returns:
            List[Optimizer]: The list of optimizers.

        Raises:
            ValueError: If the model is not an instance of MegatronParallel.
        """

        if not isinstance(model, MegatronParallel):
            raise ValueError("Model must be an instance of MegatronParallel")

        from nemo.core.optim import McoreDistributedOptimizer

        class McoreOpt(McoreDistributedOptimizer):
            def sharded_state_dict(
                self,
                model_sharded_state_dict,
                optimizer_state_dict=None,
                is_loading=False,
                # dist_ckpt_parallel_save=False, ## TODO: fix!
            ):
                # sharding_type = 'fully_sharded_model_space' if dist_ckpt_parallel_save else 'dp_zero_gather_scatter'
                sharding_type = 'dp_zero_gather_scatter'
                state_dict = self.mcore_optimizer.sharded_state_dict(
                    model_sharded_state_dict, is_loading=is_loading, sharding_type=sharding_type
                )
                return state_dict

        mcore_opt = get_megatron_optimizer(
            self.config,
            list(model),
            no_weight_decay_cond=self.no_weight_decay_cond,
            scale_lr_cond=self.scale_lr_cond,
            lr_mult=self.lr_mult,
        )

        return [McoreOpt(mcore_opt)]

    def finalize_model_grads(self, *args, **kwargs):
        return finalize_model_grads(*args, **kwargs)
        '''

# Overwrite megatron.py in the NeMo checkout with the payload above.
with open("/kaggle/working/NeMo/nemo/lightning/pytorch/optim/megatron.py", "w") as file:
    file.write(write)

In [None]:
# Replacement source for nemo/lightning/__init__.py in the working NeMo checkout.
# The payload imports MyOptimizerModule from nemo.lightning.pytorch.optim and lists it in __all__.
# NOTE(review): `OptimizerModule` appears neither in this payload's imports nor in __all__, so
# `from nemo.lightning import OptimizerModule` would fail — confirm nothing still relies on it.
# NOTE(review): the cells shown here do not rewrite nemo/lightning/pytorch/optim/__init__.py;
# confirm that package actually re-exports MyOptimizerModule, otherwise this import fails.
write = '''from typing import Union

from lightning_fabric.plugins.environments import slurm
from pytorch_lightning import plugins as _pl_plugins

# This is here to import it once, which improves the speed of launch when in debug-mode
try:
    import transformer_engine  # noqa
except ImportError:
    pass

from nemo.lightning.base import get_vocab_size, teardown
from nemo.lightning.nemo_logger import NeMoLogger
from nemo.lightning.pytorch.callbacks.megatron_model_checkpoint import ModelCheckpoint
from nemo.lightning.pytorch.optim import LRSchedulerModule, MegatronOptimizerModule, MyOptimizerModule
from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision
from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler
from nemo.lightning.pytorch.strategies import MegatronStrategy
from nemo.lightning.pytorch.trainer import Trainer
from nemo.lightning.resume import AutoResume


# We monkey patch because nvidia uses a naming convention for SLURM jobs
def _is_slurm_interactive_mode():
    job_name = slurm.SLURMEnvironment.job_name()
    return job_name is None or job_name.endswith("bash") or job_name.endswith("interactive")


slurm._is_slurm_interactive_mode = _is_slurm_interactive_mode  # noqa: SLF001


_pl_plugins._PLUGIN_INPUT = Union[_pl_plugins._PLUGIN_INPUT, _data_sampler.DataSampler]  # noqa: SLF001


__all__ = [
    "AutoResume",
    "LRSchedulerModule",
    "MegatronStrategy",
    "MegatronDataSampler",
    "MegatronMixedPrecision",
    "MegatronOptimizerModule",
    "NeMoLogger",
    "ModelCheckpoint",
    "MyOptimizerModule",
    "Trainer",
    "get_vocab_size",
    "teardown",
]
        '''

# Overwrite the package __init__.py in the NeMo checkout with the payload above.
with open("/kaggle/working/NeMo/nemo/lightning/__init__.py", "w") as file:
    file.write(write)

In [None]:
# Replacement source for nemo/collections/llm/api.py in the working NeMo checkout.
# In this payload train() accepts `optim` typed as Optional[MyOptimizerModule] and, when it is
# given, calls optim.connect(model) before trainer.fit().
# NOTE(review): pretrain() calls train(..., source=source), but train() declares no `source`
# parameter; unless the @task decorator alters the signature this raises TypeError — confirm.
write = '''from pathlib import Path
from typing import Callable, Optional

import pytorch_lightning as pl
from typing_extensions import Annotated

from nemo.collections.llm.utils import Config, task
from nemo.lightning import AutoResume, MegatronStrategy, NeMoLogger, MyOptimizerModule, Trainer, io, teardown


@task(namespace="llm")
def train(
    model: pl.LightningModule,
    data: pl.LightningDataModule,
    trainer: Trainer,
    log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None,
    resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None,
    optim: Optional[MyOptimizerModule] = None,
    tokenizer: Optional[str] = None,
    # TODO: Fix export export: Optional[str] = None,
) -> Path:
    """
    Trains a model using the specified data and trainer, with optional tokenizer, source, and export.

    Args:
        model (pl.LightningModule): The model to be trained.
        data (pl.LightningDataModule): The data module containing training data.
        trainer (Trainer): The trainer instance configured with a MegatronStrategy.
        log (NeMoLogger): A nemologger instance.
        resume (Optional[Union[AutoResume, Resume]]): Resume training from a checkpoint.
        optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer
            from the model will be used.
        tokenizer (Optional[str]): Tokenizer setting to be applied. Can be 'data' or 'model'.
        export (Optional[str]): Filename to save the exported checkpoint after training.

    Returns
    -------
        Path: The directory path where training artifacts are saved.

    Raises
    ------
        ValueError: If the trainer's strategy is not MegatronStrategy.

    Examples
    --------
        >>> model = MyModel()
        >>> data = MyDataModule()
        >>> trainer = Trainer(strategy=MegatronStrategy())
        >>> train(model, data, trainer, tokenizer='data', source='path/to/ckpt.ckpt', export='final.ckpt')
        PosixPath('/path/to/log_dir')
    """
    _log = log or NeMoLogger()
    app_state = _log.setup(
        trainer,
        resume_if_exists=getattr(resume, "resume_if_exists", False),
        task_config=getattr(train, "__io__", None),
    )
    if resume is not None:
        resume.setup(model, trainer)
    if optim:
        optim.connect(model)
    if tokenizer:  # TODO: Improve this
        _use_tokenizer(model, data, tokenizer)

    trainer.fit(model, data)

    _log.teardown()

    return app_state.exp_dir


@task(namespace="llm")
def pretrain(
    model: pl.LightningModule,
    data: pl.LightningDataModule,
    trainer: Trainer,
    source: Optional[str] = None,
    # export: Optional[str] = None
) -> Path:
    return train(model=model, data=data, trainer=trainer, tokenizer="data", source=source)


@task(namespace="llm")
def validate(
    model: pl.LightningModule,
    data: pl.LightningDataModule,
    trainer: Trainer,
    tokenizer: Optional[str] = None,
    source: Optional[str] = None,
    export: Optional[str] = None,
) -> Path:
    if not isinstance(trainer.strategy, MegatronStrategy):
        raise ValueError("Only MegatronStrategy is supported")

    validate_kwargs = {}
    run_dir = Path(trainer.logger.log_dir)
    export_dir = run_dir / "export"

    if tokenizer:  # TODO: Improve this
        _use_tokenizer(model, data, tokenizer)
    if source:
        _add_ckpt_path(source, model, validate_kwargs)

    trainer.validate(model, data, **validate_kwargs)
    trainer.save_checkpoint(export_dir)
    if export:
        teardown(trainer)
        del trainer, model, data
        export_ckpt(export_dir, export)

    return run_dir


@task(name="import", namespace="llm")
def import_ckpt(
    model: pl.LightningModule,
    source: str,
    output_path: Optional[Path] = None,
    overwrite: bool = False,
) -> Path:
    return io.import_ckpt(model=model, source=source, output_path=output_path, overwrite=overwrite)


def load_connector_from_trainer_ckpt(path: Path, target: str) -> io.ModelConnector:
    return io.load_ckpt(path).model.exporter(target, path)


@task(name="export", namespace="llm")
def export_ckpt(
    path: Path,
    target: str,
    output_path: Optional[Path] = None,
    overwrite: bool = False,
    load_connector: Callable[[Path, str], io.ModelConnector] = load_connector_from_trainer_ckpt,
) -> Path:
    return io.export_ckpt(path, target, output_path, overwrite, load_connector)


def _use_tokenizer(model: pl.LightningModule, data: pl.LightningDataModule, tokenizer: str) -> None:
    if tokenizer == "data":
        model.tokenizer = data.tokenizer
    elif tokenizer == "model":
        data.tokenizer = model.tokenizer


def _add_ckpt_path(source, model, kwargs) -> None:
    if io.is_distributed_ckpt(source):
        kwargs["ckpt_path"] = source
    else:
        kwargs["ckpt_path"] = model.import_ckpt(source)


def _save_config_img(*args, **kwargs):
    try:
        from nemo_sdk.utils import save_config_img

        save_config_img(*args, **kwargs)
    except ImportError:
        pass
        '''

# Overwrite api.py in the NeMo checkout with the payload above.
with open("/kaggle/working/NeMo/nemo/collections/llm/api.py", "w") as file:
    file.write(write)

In [None]:
write = '''
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Annotated, Callable, Optional

import torch
import torch.nn.functional as F

from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel
from nemo.collections.llm.utils import Config
from nemo.lightning import MyOptimizerModule, io, teardown

if TYPE_CHECKING:
    from transformers import LlamaConfig as HFLlamaConfig
    from transformers import LlamaForCausalLM

    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec


# Note: these Llama configs are copied from the corresponding HF model. You may need to modify the parameter for
# your own needs, in particular: seq_length and rotary_base.
@dataclass
class LlamaConfig(GPTConfig):
    # configs that are common across model sizes
    normalization: str = "RMSNorm"
    activation_func: Callable = F.silu
    gated_linear_unit: bool = True
    position_embedding_type: str = "rope"
    add_bias_linear: bool = False
    seq_length: int = 4096


@dataclass
class Llama2Config7B(LlamaConfig):
    num_layers: int = 32
    hidden_size: int = 4096
    num_attention_heads: int = 32
    num_query_groups: int = 32
    ffn_hidden_size: int = 11008


@dataclass
class Llama2Config13B(LlamaConfig):
    num_layers: int = 40
    hidden_size: int = 5120
    num_attention_heads: int = 40
    num_query_groups: int = 40
    ffn_hidden_size: int = 13824


@dataclass
class Llama2Config70B(LlamaConfig):
    num_layers: int = 80
    hidden_size: int = 8192
    num_attention_heads: int = 64
    num_query_groups: int = 8
    ffn_hidden_size: int = 28672


@dataclass
class Llama3Config8B(Llama2Config7B):
    seq_length: int = 8192
    num_query_groups: int = 8
    ffn_hidden_size: int = 14336


@dataclass
class Llama3Config70B(Llama2Config70B):
    seq_length: int = 8192


@dataclass
class CodeLlamaConfig7B(Llama2Config7B):
    rotary_base: int = 1_000_000
    seq_length: int = 16384


@dataclass
class CodeLlamaConfig13B(Llama2Config13B):
    rotary_base: int = 1_000_000
    seq_length: int = 16384


@dataclass
class CodeLlamaConfig34B(LlamaConfig):
    num_layers: int = 48
    hidden_size: int = 8192
    num_attention_heads: int = 64
    num_query_groups: int = 8
    ffn_hidden_size: int = 22016
    rotary_base: int = 1_000_000
    seq_length: int = 16384


@dataclass
class CodeLlamaConfig70B(Llama2Config70B):
    pass


class LlamaModel(GPTModel):
    def __init__(
        self,
        config: Annotated[Optional[LlamaConfig], Config[LlamaConfig]] = None,
        optim: Optional[MyOptimizerModule] = None,
        tokenizer: Optional["TokenizerSpec"] = None,
    ):
        super().__init__(config or LlamaConfig(), optim=optim, tokenizer=tokenizer)


@io.model_importer(LlamaModel, "hf")
class HFLlamaImporter(io.ModelConnector["LlamaForCausalLM", LlamaModel]):
    def init(self) -> LlamaModel:
        return LlamaModel(self.config, tokenizer=self.tokenizer)

    def apply(self, output_path: Path) -> Path:
        from transformers import LlamaForCausalLM

        source = LlamaForCausalLM.from_pretrained(str(self))
        target = self.init()
        trainer = self.nemo_setup(target)
        self.convert_state(source, target)
        self.nemo_save(output_path, trainer)

        print(f"Converted Llama model to Nemo, model saved to {output_path}")

        teardown(trainer, target)
        del trainer, target

        return output_path

    def convert_state(self, source, target):
        mapping = {
            "model.embed_tokens.weight": "embedding.word_embeddings.weight",
            "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight",
            "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight",
            "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
            "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
            "model.norm.weight": "decoder.final_layernorm.weight",
            "lm_head.weight": "output_layer.weight",
        }

        return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1])

    @property
    def tokenizer(self) -> "AutoTokenizer":
        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

        return AutoTokenizer(str(self))

    @property
    def config(self) -> LlamaConfig:
        from transformers import LlamaConfig as HFLlamaConfig

        source = HFLlamaConfig.from_pretrained(str(self))

        def make_vocab_size_divisible_by(vocab_size):
            base = 128
            while vocab_size % base != 0:
                base //= 2
            return base

        output = LlamaConfig(
            num_layers=source.num_hidden_layers,
            hidden_size=source.hidden_size,
            ffn_hidden_size=source.intermediate_size,
            num_attention_heads=source.num_attention_heads,
            init_method_std=source.initializer_range,
            layernorm_epsilon=source.rms_norm_eps,
            num_query_groups=source.num_key_value_heads,
            rotary_base=source.rope_theta,
            gated_linear_unit=True,
            make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size),
            share_embeddings_and_output_weights=False,
        )

        return output


@io.model_exporter(LlamaModel, "hf")
class HFLlamaExporter(io.ModelConnector[LlamaModel, "LlamaForCausalLM"]):
    def init(self) -> "LlamaForCausalLM":
        from transformers import AutoModelForCausalLM

        return AutoModelForCausalLM.from_config(self.config)

    def apply(self, output_path: Path) -> Path:
        target = self.init()
        source, _ = self.nemo_load(str(self))
        target = self.convert_state(source, target)

        target = target.cpu()
        target.save_pretrained(output_path)
        self.tokenizer.save_pretrained(output_path)

        return output_path

    def convert_state(self, source, target):
        mapping = {
            "embedding.word_embeddings.weight": "model.embed_tokens.weight",
            "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
            "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
            "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
            "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
            "decoder.final_layernorm.weight": "model.norm.weight",
            "output_layer.weight": "lm_head.weight",
        }

        return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1])

    @property
    def tokenizer(self):
        return io.load_ckpt(str(self)).model.tokenizer.tokenizer

    @property
    def config(self) -> "HFLlamaConfig":
        source: LlamaConfig = io.load_ckpt(str(self)).model.config

        from transformers import LlamaConfig as HFLlamaConfig

        return HFLlamaConfig(
            num_hidden_layers=source.num_layers,
            hidden_size=source.hidden_size,
            intermediate_size=source.ffn_hidden_size,
            num_attention_heads=source.num_attention_heads,
            max_position_embeddings=source.seq_length,
            initializer_range=source.init_method_std,
            rms_norm_eps=source.layernorm_epsilon,
            num_key_value_heads=source.num_query_groups,
            rope_theta=source.rotary_base,
            vocab_size=self.tokenizer.vocab_size,
        )


@io.state_transform(
    source_key=(
        "model.layers.*.self_attn.q_proj.weight",
        "model.layers.*.self_attn.k_proj.weight",
        "model.layers.*.self_attn.v_proj.weight",
    ),
    target_key="decoder.layers.*.self_attention.linear_qkv.weight",
)
def _import_qkv(ctx: io.TransformCTX, q, k, v):
    megatron_config = ctx.target.config

    head_num = megatron_config.num_attention_heads
    num_query_groups = megatron_config.num_query_groups
    heads_per_group = head_num // num_query_groups
    hidden_size = megatron_config.hidden_size
    head_num = megatron_config.num_attention_heads
    head_size = hidden_size // head_num

    old_tensor_shape = q.size()
    new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
    new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]

    q = q.view(*new_q_tensor_shape)
    k = k.view(*new_kv_tensor_shape)
    v = v.view(*new_kv_tensor_shape)

    qkv_weights_l = []
    for i in range(num_query_groups):
        qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :])
        qkv_weights_l.append(k[i : i + 1, :, :])
        qkv_weights_l.append(v[i : i + 1, :, :])
    qkv_weights = torch.cat(qkv_weights_l)
    assert qkv_weights.ndim == 3, qkv_weights.shape
    assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape
    assert qkv_weights.shape[1] == head_size, qkv_weights.shape
    assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape

    qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size])

    return qkv_weights


@io.state_transform(
    source_key="decoder.layers.*.self_attention.linear_qkv.weight",
    target_key=(
        "model.layers.*.self_attn.q_proj.weight",
        "model.layers.*.self_attn.k_proj.weight",
        "model.layers.*.self_attn.v_proj.weight",
    ),
)
def _export_qkv(ctx: io.TransformCTX, linear_qkv):
    megatron_config = ctx.source.config

    head_num = megatron_config.num_attention_heads
    num_query_groups = megatron_config.num_query_groups
    heads_per_group = head_num // num_query_groups
    hidden_size = megatron_config.hidden_size
    head_num = megatron_config.num_attention_heads
    head_size = hidden_size // head_num
    qkv_total_dim = head_num + 2 * num_query_groups

    linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size])
    q_slice = torch.cat(
        [
            torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
            for i in range(num_query_groups)
        ]
    )
    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))

    q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
    k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
    v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()

    return q_proj, k_proj, v_proj


@io.state_transform(
    source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"),
    target_key="decoder.layers.*.mlp.linear_fc1.weight",
)
def _import_linear_fc1(down, gate):
    """Concatenate the two HF MLP input projections into one fused fc1 weight.

    NOTE(review): the parameter names do not match ``source_key``; presumably ``down``
    receives ``gate_proj`` and ``gate`` receives ``up_proj`` (positional order) -- confirm
    against ``io.state_transform``.

    Returns:
        The two tensors stacked along dim 0 and cast to float32.
    """
    fused = torch.cat([down, gate], dim=0)
    return fused.float()


@io.state_transform(
    source_key="decoder.layers.*.mlp.linear_fc1.weight",
    target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"),
)
def _export_linear_fc1(linear_fc1):
    """Split the fused fc1 weight into its two halves along dim 0.

    Returns:
        Tuple ``(gate_proj, up_proj)``: the first and second chunk of ``linear_fc1``.
    """
    first_half, second_half = linear_fc1.chunk(2, dim=0)
    return first_half, second_half


# Names exported by a star-import of this module.
__all__ = [
    "LlamaConfig",
    "Llama2Config7B",
    "Llama2Config13B",
    "Llama2Config70B",
    "Llama3Config8B",
    "Llama3Config70B",
    "CodeLlamaConfig7B",
    "CodeLlamaConfig13B",
    "CodeLlamaConfig34B",
    "CodeLlamaConfig70B",
    "LlamaModel",
]
        '''

# Overwrite llama.py in the working NeMo checkout with the module source held in `write`.
with open("/kaggle/working/NeMo/nemo/collections/llm/gpt/model/llama.py", "w") as file:
    file.write(write)

In [None]:
write = '''from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Annotated, Callable, Optional

import torch

from nemo.collections.llm.fn.activation import openai_gelu
from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel
from nemo.collections.llm.utils import Config
from nemo.lightning import MyOptimizerModule, io, teardown

if TYPE_CHECKING:
    from transformers import GemmaForCausalLM

    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec


# Note: Gemma requires huggingface transformers >= 4.38
# Note: these Gemma configs are copied from the corresponding HF model. You may need to modify the parameter for
# your own needs, in particular: seq_length and rotary_base.
@dataclass
class GemmaConfig(GPTConfig):
    """Base Gemma configuration: the ``GPTConfig`` overrides shared by every Gemma size."""

    # configs that are common across model sizes
    normalization: str = "RMSNorm"
    activation_func: Callable = openai_gelu
    gated_linear_unit: bool = True
    position_embedding_type: str = "rope"
    add_bias_linear: bool = False
    seq_length: int = 8192
    kv_channels: int = 256
    share_embeddings_and_output_weights: bool = True
    # Note: different behavior compared to Legacy NeMo
    # Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script
    # The present implementation is more in line with the official implementation
    layernorm_zero_centered_gamma: bool = True


@dataclass
class GemmaConfig2B(GemmaConfig):
    """Layer and width settings for the 2B Gemma variant (single query group)."""

    num_layers: int = 18
    hidden_size: int = 2048
    num_attention_heads: int = 8
    num_query_groups: int = 1
    ffn_hidden_size: int = 16384


@dataclass
class GemmaConfig7B(GemmaConfig):
    """Layer and width settings for the 7B Gemma variant (one query group per head).

    Note that ``hidden_size // num_attention_heads`` is 192 here while the inherited
    ``kv_channels`` is 256, so the per-head size cannot be derived from ``hidden_size``.
    """

    num_layers: int = 28
    hidden_size: int = 3072
    num_attention_heads: int = 16
    num_query_groups: int = 16
    ffn_hidden_size: int = 24576


class CodeGemmaConfig2B(GemmaConfig2B):
    """CodeGemma 2B configuration; adds nothing on top of ``GemmaConfig2B``."""

    pass


class CodeGemmaConfig7B(GemmaConfig7B):
    """CodeGemma 7B configuration; adds nothing on top of ``GemmaConfig7B``."""

    pass


class GemmaModel(GPTModel):
    """``GPTModel`` that falls back to a default ``GemmaConfig`` when no config is given."""

    def __init__(
        self,
        config: Annotated[Optional[GemmaConfig], Config[GemmaConfig]] = None,
        optim: Optional[MyOptimizerModule] = None,
        tokenizer: Optional["TokenizerSpec"] = None,
    ):
        """Forward ``config`` (or a fresh ``GemmaConfig``), ``optim`` and ``tokenizer`` to ``GPTModel``."""
        effective_config = config or GemmaConfig()
        super().__init__(effective_config, optim=optim, tokenizer=tokenizer)


@io.model_importer(GemmaModel, "hf")
class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]):
    """Connector that converts a Hugging Face Gemma checkpoint into a NeMo ``GemmaModel``.

    ``str(self)`` is used as the Hugging Face model name or path throughout.
    """

    def init(self) -> GemmaModel:
        """Build the NeMo model from the translated config and the HF tokenizer."""
        return GemmaModel(self.config, tokenizer=self.tokenizer)

    def apply(self, output_path: Path) -> Path:
        """Load the HF model, copy its weights into a NeMo model and save it.

        Args:
            output_path: Destination passed to ``nemo_save``.

        Returns:
            ``output_path`` unchanged.
        """
        from transformers import GemmaForCausalLM

        source = GemmaForCausalLM.from_pretrained(str(self))
        target = self.init()
        trainer = self.nemo_setup(target)
        self.convert_state(source, target)
        self.nemo_save(output_path, trainer)

        print(f"Converted Gemma model to Nemo, model saved to {output_path}")

        teardown(trainer, target)
        del trainer, target

        return output_path

    def convert_state(self, source, target):
        """Copy weights from ``source`` to ``target`` using the name mapping plus the QKV/fc1 transforms."""
        # One-to-one renames; fused weights are handled by the transforms below.
        # No lm_head entry: the output layer shares the embedding weight (see ``config``).
        mapping = {
            "model.embed_tokens.weight": "embedding.word_embeddings.weight",
            "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight",
            "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight",
            "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
            "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
            "model.norm.weight": "decoder.final_layernorm.weight",
        }

        return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1])

    @property
    def tokenizer(self) -> "AutoTokenizer":
        """NeMo ``AutoTokenizer`` wrapper created from the same model name or path."""
        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

        return AutoTokenizer(str(self))

    @property
    def config(self) -> GemmaConfig:
        """Translate the Hugging Face Gemma config into a NeMo ``GemmaConfig``."""
        from transformers import GemmaConfig as HFGemmaConfig

        source = HFGemmaConfig.from_pretrained(str(self))

        def make_vocab_size_divisible_by(vocab_size):
            # Largest power of two <= 128 that divides the vocabulary size.
            base = 128
            while vocab_size % base != 0:
                base //= 2
            return base

        output = GemmaConfig(
            num_layers=source.num_hidden_layers,
            hidden_size=source.hidden_size,
            ffn_hidden_size=source.intermediate_size,
            num_attention_heads=source.num_attention_heads,
            init_method_std=source.initializer_range,
            layernorm_epsilon=source.rms_norm_eps,
            num_query_groups=source.num_key_value_heads,
            rotary_base=source.rope_theta,
            gated_linear_unit=True,
            make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size),
            # Keep embeddings tied: convert_state maps no lm_head weight, so an untied
            # output layer would have nothing copied into it.
            share_embeddings_and_output_weights=True,
        )

        return output


@io.model_exporter(GemmaModel, "hf")
class HFGemmaExporter(io.ModelConnector[GemmaModel, "GemmaForCausalLM"]):
    """Connector that converts a NeMo ``GemmaModel`` checkpoint into a Hugging Face model.

    ``str(self)`` is used as the NeMo checkpoint path throughout.
    """

    def init(self) -> "GemmaForCausalLM":
        """Instantiate an HF causal-LM model from the translated config."""
        from transformers import AutoModelForCausalLM

        return AutoModelForCausalLM.from_config(self.config)

    def apply(self, output_path: Path) -> Path:
        """Load the NeMo checkpoint, copy its weights into an HF model and save model and tokenizer.

        Args:
            output_path: Directory passed to ``save_pretrained``.

        Returns:
            ``output_path`` unchanged.
        """
        target = self.init()
        source, _ = self.nemo_load(str(self))
        target = self.convert_state(source, target)

        target = target.cpu()
        target.save_pretrained(output_path)
        self.tokenizer.save_pretrained(output_path)

        return output_path

    def convert_state(self, source, target):
        """Copy weights from ``source`` to ``target`` using the name mapping plus the QKV/fc1 transforms."""
        mapping = {
            "embedding.word_embeddings.weight": "model.embed_tokens.weight",
            "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
            "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
            "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
            "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
            "decoder.final_layernorm.weight": "model.norm.weight",
        }

        return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1])

    @property
    def tokenizer(self):
        """Underlying tokenizer object stored in the NeMo checkpoint."""
        return io.load_ckpt(str(self)).model.tokenizer.tokenizer

    @property
    def config(self) -> "HFGemmaConfig":
        """Translate the checkpoint's NeMo ``GemmaConfig`` into a Hugging Face ``GemmaConfig``."""
        source: GemmaConfig = io.load_ckpt(str(self)).model.config

        from transformers import GemmaConfig as HFGemmaConfig

        return HFGemmaConfig(
            num_hidden_layers=source.num_layers,
            hidden_size=source.hidden_size,
            intermediate_size=source.ffn_hidden_size,
            num_attention_heads=source.num_attention_heads,
            # Per-head size is an independent setting (kv_channels), not hidden_size // heads.
            head_dim=source.kv_channels,
            max_position_embeddings=source.seq_length,
            initializer_range=source.init_method_std,
            rms_norm_eps=source.layernorm_epsilon,
            num_key_value_heads=source.num_query_groups,
            # Carry the rotary base across so a non-default value survives the export.
            rope_theta=source.rotary_base,
            vocab_size=self.tokenizer.vocab_size,
        )


@io.state_transform(
    source_key=(
        "model.layers.*.self_attn.q_proj.weight",
        "model.layers.*.self_attn.k_proj.weight",
        "model.layers.*.self_attn.v_proj.weight",
    ),
    target_key="decoder.layers.*.self_attention.linear_qkv.weight",
)
def _import_qkv(ctx: io.TransformCTX, q, k, v):
    """Fuse separate HF q/k/v projection weights into one Megatron QKV weight.

    For every query group the fused weight holds that group's query heads, then its key
    head, then its value head.

    Args:
        ctx: Transform context; the Megatron config is read from ``ctx.target.config``.
        q: Query projection weight of one layer.
        k: Key projection weight of one layer.
        v: Value projection weight of one layer.

    Returns:
        2-D tensor of shape ``(head_size * (head_num + 2 * num_query_groups), hidden_size)``.
    """
    megatron_config = ctx.target.config

    head_num = megatron_config.num_attention_heads
    num_query_groups = megatron_config.num_query_groups
    heads_per_group = head_num // num_query_groups
    hidden_size = megatron_config.hidden_size
    # Gemma's per-head size is kv_channels and is not tied to hidden_size // head_num
    # (GemmaConfig7B: 3072 // 16 = 192 but kv_channels = 256). Fall back to the ratio
    # only when kv_channels is unset.
    head_size = megatron_config.kv_channels or hidden_size // head_num

    old_tensor_shape = q.size()
    new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
    new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]

    q = q.view(*new_q_tensor_shape)
    k = k.view(*new_kv_tensor_shape)
    v = v.view(*new_kv_tensor_shape)

    # Interleave per group: [q heads of group i, k head i, v head i].
    qkv_weights_l = []
    for i in range(num_query_groups):
        qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :])
        qkv_weights_l.append(k[i : i + 1, :, :])
        qkv_weights_l.append(v[i : i + 1, :, :])
    qkv_weights = torch.cat(qkv_weights_l)
    assert qkv_weights.ndim == 3, qkv_weights.shape
    assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape
    assert qkv_weights.shape[1] == head_size, qkv_weights.shape
    assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape

    qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size])

    return qkv_weights


@io.state_transform(
    source_key="decoder.layers.*.self_attention.linear_qkv.weight",
    target_key=(
        "model.layers.*.self_attn.q_proj.weight",
        "model.layers.*.self_attn.k_proj.weight",
        "model.layers.*.self_attn.v_proj.weight",
    ),
)
def _export_qkv(ctx: io.TransformCTX, linear_qkv):
    """Split a fused Megatron QKV weight into separate q/k/v projection weights.

    The fused weight is read as ``head_num + 2 * num_query_groups`` rows of shape
    ``(head_size, hidden_size)``; each query group occupies ``heads_per_group`` query
    rows followed by one key row and one value row.

    Args:
        ctx: Transform context; the Megatron config is read from ``ctx.source.config``.
        linear_qkv: Fused QKV weight of one layer.

    Returns:
        Tuple ``(q_proj, k_proj, v_proj)`` of 2-D CPU tensors with ``hidden_size`` columns.
    """
    megatron_config = ctx.source.config

    head_num = megatron_config.num_attention_heads
    num_query_groups = megatron_config.num_query_groups
    heads_per_group = head_num // num_query_groups
    hidden_size = megatron_config.hidden_size
    # Gemma's per-head size is kv_channels and is not tied to hidden_size // head_num
    # (GemmaConfig7B: 3072 // 16 = 192 but kv_channels = 256). Fall back to the ratio
    # only when kv_channels is unset.
    head_size = megatron_config.kv_channels or hidden_size // head_num
    qkv_total_dim = head_num + 2 * num_query_groups

    linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size])
    # Row indices of the query heads: the first heads_per_group rows of every group.
    q_slice = torch.cat(
        [
            torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
            for i in range(num_query_groups)
        ]
    )
    # The key row follows each group's query rows; the value row follows the key row.
    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))

    q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
    k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
    v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()

    return q_proj, k_proj, v_proj


@io.state_transform(
    source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"),
    target_key="decoder.layers.*.mlp.linear_fc1.weight",
)
def _import_linear_fc1(down, gate):
    """Concatenate the two HF MLP input projections into one fused fc1 weight.

    NOTE(review): the parameter names do not match ``source_key``; presumably ``down``
    receives ``gate_proj`` and ``gate`` receives ``up_proj`` (positional order) -- confirm
    against ``io.state_transform``.

    Returns:
        The two tensors stacked along dim 0 and cast to float32.
    """
    stacked = torch.cat([down, gate], dim=0)
    return stacked.float()


@io.state_transform(
    source_key="decoder.layers.*.mlp.linear_fc1.weight",
    target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"),
)
def _export_linear_fc1(linear_fc1):
    """Split the fused fc1 weight into its two halves along dim 0.

    Returns:
        Tuple ``(gate_proj, up_proj)``: the first and second chunk of ``linear_fc1``.
    """
    first_half, second_half = linear_fc1.chunk(2, dim=0)
    return first_half, second_half


# Names exported by a star-import of this module.
__all__ = [
    "GemmaConfig",
    "GemmaConfig2B",
    "GemmaConfig7B",
    "CodeGemmaConfig2B",
    "CodeGemmaConfig7B",
    "GemmaModel",
]
        '''

# Overwrite gemma.py in the working NeMo checkout with the module source held in `write`.
with open("/kaggle/working/NeMo/nemo/collections/llm/gpt/model/gemma.py", "w") as file:
    file.write(write)

In [None]:
write = '''
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Callable, List, Optional

import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from typing_extensions import Annotated

from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel
from nemo.collections.llm.utils import Config
from nemo.lightning import io, teardown
from nemo.lightning.pytorch.optim import MyOptimizerModule

if TYPE_CHECKING:
    from transformers import MistralConfig, MistralForCausalLM

    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec


@dataclass
class MistralConfig7B(GPTConfig):
    """``GPTConfig`` defaults for the Mistral 7B model."""

    normalization: str = "RMSNorm"
    activation_func: Callable = F.silu
    position_embedding_type: str = "rope"
    add_bias_linear: bool = False
    gated_linear_unit: bool = True
    apply_query_key_layer_scaling: bool = False  # TODO: Should this be True?

    num_layers: int = 32
    hidden_size: int = 4096
    num_attention_heads: int = 32
    num_query_groups: int = 8
    ffn_hidden_size: int = 14336
    seq_length: int = 32768

    init_method_std: float = 0.02
    layernorm_epsilon: float = 1e-5
    # The HF importer/exporter treat element 0 as the sliding-window length.
    # default_factory gives every instance its own list.
    window_size: List[int] = field(default_factory=lambda: [4096, 0])


class MistralModel(GPTModel):
    """``GPTModel`` that falls back to a default ``MistralConfig7B`` when no config is given."""

    def __init__(
        self,
        config: Annotated[Optional[MistralConfig7B], Config[MistralConfig7B]] = None,
        optim: Optional[MyOptimizerModule] = None,
        tokenizer: Optional["TokenizerSpec"] = None,
    ):
        """Forward ``config`` (or a fresh ``MistralConfig7B``), ``optim`` and ``tokenizer`` to ``GPTModel``."""
        effective_config = config or MistralConfig7B()
        super().__init__(effective_config, optim=optim, tokenizer=tokenizer)


@io.model_importer(MistralModel, "hf")
class HFMistralImporter(io.ModelConnector["MistralForCausalLM", MistralModel]):
    """Connector that converts a Hugging Face Mistral checkpoint into a NeMo ``MistralModel``.

    ``str(self)`` is used as the Hugging Face model name or path throughout.
    """

    def init(self) -> MistralModel:
        """Build the NeMo model from the translated config and the HF tokenizer."""
        return MistralModel(self.config, tokenizer=self.tokenizer)

    def apply(self, output_path: Path) -> Path:
        """Load the HF model, copy its weights into a NeMo model and save it.

        Args:
            output_path: Destination passed to ``nemo_save``.

        Returns:
            ``output_path`` unchanged.
        """
        from transformers import MistralForCausalLM

        source = MistralForCausalLM.from_pretrained(str(self))
        target = self.init()
        trainer = self.nemo_setup(target)
        self.convert_state(source, target)
        self.nemo_save(output_path, trainer)

        print(f"Converted Mistral 7B model to Nemo, model saved to {output_path}")

        teardown(trainer, target)
        del trainer, target

        return output_path

    def convert_state(self, source, target):
        """Copy weights from ``source`` to ``target`` using the name mapping plus the QKV/fc1 transforms."""
        mapping = {
            "model.embed_tokens.weight": "embedding.word_embeddings.weight",
            "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight",
            "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight",
            "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
            "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
            "model.norm.weight": "decoder.final_layernorm.weight",
            "lm_head.weight": "output_layer.weight",
        }

        return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1])

    @property
    def tokenizer(self) -> "AutoTokenizer":
        """NeMo ``AutoTokenizer`` wrapper created from the same model name or path."""
        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

        return AutoTokenizer(str(self))

    @property
    def config(self) -> MistralConfig7B:
        """Translate the Hugging Face Mistral config into a NeMo ``MistralConfig7B``."""
        from transformers import MistralConfig

        source = MistralConfig.from_pretrained(str(self))

        def make_vocab_size_divisible_by(mistral_vocab_size):
            # Largest power of two <= 128 that divides the vocabulary size.
            base = 128
            while mistral_vocab_size % base != 0:
                base //= 2
            return base

        # sliding_window may be None in an HF Mistral config. In that case fall back to
        # the position-embedding limit and disable windowed attention instead of
        # passing None-derived values.
        sliding_window = getattr(source, "sliding_window", None)
        if sliding_window is None:
            seq_length = source.max_position_embeddings
            window_size = None
        else:
            seq_length = sliding_window
            window_size = [sliding_window, 0]

        output = MistralConfig7B(
            seq_length=seq_length,
            num_layers=source.num_hidden_layers,
            hidden_size=source.hidden_size,
            ffn_hidden_size=source.intermediate_size,
            num_attention_heads=source.num_attention_heads,
            # max_position_embeddings=source.max_position_embeddings,
            init_method_std=source.initializer_range,
            layernorm_epsilon=source.rms_norm_eps,
            num_query_groups=source.num_key_value_heads,
            rotary_base=source.rope_theta,
            gated_linear_unit=True,
            make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size),
            window_size=window_size,
            share_embeddings_and_output_weights=False,
        )

        return output


@io.model_exporter(MistralModel, "hf")
class HFMistralExporter(io.ModelConnector[MistralModel, "MistralForCausalLM"]):
    """Connector that converts a NeMo ``MistralModel`` checkpoint into a Hugging Face model.

    ``str(self)`` is used as the NeMo checkpoint path throughout.
    """

    def init(self) -> "MistralForCausalLM":
        """Instantiate an HF causal-LM model from the translated config."""
        from transformers import AutoModelForCausalLM

        return AutoModelForCausalLM.from_config(self.config)

    def apply(self, output_path: Path) -> Path:
        """Load the NeMo checkpoint, copy its weights into an HF model and save model and tokenizer.

        Args:
            output_path: Directory passed to ``save_pretrained``.

        Returns:
            ``output_path`` unchanged.
        """
        # TODO: Make it work with lazy init
        # with torch.device("meta"):
        #     target = self.init()
        target = self.init()
        source, _ = self.nemo_load(str(self))
        target = self.convert_state(source, target)

        # TODO: Make sure we don't need to do this
        target = target.cpu()
        target.save_pretrained(output_path)
        self.tokenizer.save_pretrained(output_path)

        return output_path

    def convert_state(self, source, target):
        """Copy weights from ``source`` to ``target`` using the name mapping plus the QKV/fc1 transforms."""
        mapping = {
            "embedding.word_embeddings.weight": "model.embed_tokens.weight",
            "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
            "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
            "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
            "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
            "decoder.final_layernorm.weight": "model.norm.weight",
            "output_layer.weight": "lm_head.weight",
        }

        return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1])

    @property
    def tokenizer(self):
        """Underlying tokenizer object stored in the NeMo checkpoint."""
        return io.load_ckpt(str(self)).model.tokenizer.tokenizer

    @property
    def config(self) -> "MistralConfig":
        """Translate the checkpoint's NeMo ``MistralConfig7B`` into a Hugging Face ``MistralConfig``."""
        source: MistralConfig7B = io.load_ckpt(str(self)).model.config

        from transformers import MistralConfig as HfMistralConfig

        # window_size may be None (windowed attention disabled); export that as
        # sliding_window=None instead of failing on the subscript.
        window_size = source.window_size
        sliding_window = window_size[0] if window_size is not None else None

        return HfMistralConfig(
            sliding_window=sliding_window,
            num_hidden_layers=source.num_layers,
            hidden_size=source.hidden_size,
            intermediate_size=source.ffn_hidden_size,
            num_attention_heads=source.num_attention_heads,
            max_position_embeddings=source.seq_length,
            initializer_range=source.init_method_std,
            rms_norm_eps=source.layernorm_epsilon,
            num_key_value_heads=source.num_query_groups,
            rope_theta=source.rotary_base,
            vocab_size=self.tokenizer.vocab_size,
        )


@io.state_transform(
    source_key=(
        "model.layers.*.self_attn.q_proj.weight",
        "model.layers.*.self_attn.k_proj.weight",
        "model.layers.*.self_attn.v_proj.weight",
    ),
    target_key="decoder.layers.*.self_attention.linear_qkv.weight",
)
def _import_qkv(ctx: io.TransformCTX, q, k, v):
    """Fuse separate HF q/k/v projection weights into one Megatron QKV weight.

    For every query group the fused weight holds that group's query heads, then its key
    head, then its value head.

    Args:
        ctx: Transform context; the Megatron config is read from ``ctx.target.config``.
        q: Query projection weight of one layer.
        k: Key projection weight of one layer.
        v: Value projection weight of one layer.

    Returns:
        2-D tensor of shape ``(head_size * (head_num + 2 * num_query_groups), hidden_size)``.
    """
    cfg = ctx.target.config

    n_heads = cfg.num_attention_heads
    n_groups = cfg.num_query_groups
    group_width = n_heads // n_groups
    model_dim = cfg.hidden_size
    head_dim = model_dim // n_heads

    # Shape of q before it is split into heads; the trailing dims are kept as-is.
    flat_shape = q.size()
    trailing_dims = flat_shape[1:]

    q = q.view(n_heads, head_dim, *trailing_dims)
    k = k.view(n_groups, head_dim, *trailing_dims)
    v = v.view(n_groups, head_dim, *trailing_dims)

    # Interleave per group: [q heads of group g, k head g, v head g].
    pieces = []
    for g in range(n_groups):
        pieces.extend(
            (
                q[g * group_width : (g + 1) * group_width, :, :],
                k[g : g + 1, :, :],
                v[g : g + 1, :, :],
            )
        )
    fused = torch.cat(pieces)

    assert fused.ndim == 3, fused.shape
    assert fused.shape[0] == (group_width + 2) * n_groups, fused.shape
    assert fused.shape[1] == head_dim, fused.shape
    assert fused.shape[2] == flat_shape[1], fused.shape

    return fused.reshape([head_dim * (n_heads + 2 * n_groups), model_dim])


@io.state_transform(
    source_key="decoder.layers.*.self_attention.linear_qkv.weight",
    target_key=(
        "model.layers.*.self_attn.q_proj.weight",
        "model.layers.*.self_attn.k_proj.weight",
        "model.layers.*.self_attn.v_proj.weight",
    ),
)
def _export_qkv(ctx: io.TransformCTX, linear_qkv):
    """Split a fused Megatron QKV weight into separate q/k/v projection weights.

    The fused weight is read as ``head_num + 2 * num_query_groups`` rows of shape
    ``(head_size, hidden_size)``; each query group occupies ``heads_per_group`` query
    rows followed by one key row and one value row.

    Args:
        ctx: Transform context; the Megatron config is read from ``ctx.source.config``.
        linear_qkv: Fused QKV weight of one layer.

    Returns:
        Tuple ``(q_proj, k_proj, v_proj)`` of 2-D CPU tensors with ``hidden_size`` columns.
    """
    cfg = ctx.source.config

    n_heads = cfg.num_attention_heads
    n_groups = cfg.num_query_groups
    group_width = n_heads // n_groups
    model_dim = cfg.hidden_size
    head_dim = model_dim // n_heads
    total_rows = n_heads + 2 * n_groups
    # Number of rows one query group occupies: its query heads plus one K and one V row.
    stride = group_width + 2

    per_head = linear_qkv.reshape([total_rows, head_dim, model_dim])

    q_rows = torch.cat([torch.arange(stride * g, stride * g + group_width) for g in range(n_groups)])
    k_rows = torch.arange(group_width, total_rows, stride)
    v_rows = torch.arange(group_width + 1, total_rows, stride)

    def _gather(rows):
        # Select the per-head rows and flatten them back into a 2-D projection weight.
        return per_head[rows].reshape(-1, model_dim).cpu()

    return _gather(q_rows), _gather(k_rows), _gather(v_rows)


@io.state_transform(
    source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"),
    target_key="decoder.layers.*.mlp.linear_fc1.weight",
)
def _import_linear_fc1(down, gate):
    """Concatenate the two HF MLP input projections into one fused fc1 weight.

    NOTE(review): the parameter names do not match ``source_key``; presumably ``down``
    receives ``gate_proj`` and ``gate`` receives ``up_proj`` (positional order) -- confirm
    against ``io.state_transform``.

    Returns:
        The two tensors stacked along dim 0 and cast to float32.
    """
    stacked = torch.cat([down, gate], dim=0)
    return stacked.float()


@io.state_transform(
    source_key="decoder.layers.*.mlp.linear_fc1.weight",
    target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"),
)
def _export_linear_fc1(linear_fc1):
    """Split the fused fc1 weight into its two halves along dim 0.

    Returns:
        Tuple ``(gate_proj, up_proj)``: the first and second chunk of ``linear_fc1``.
    """
    first_half, second_half = linear_fc1.chunk(2, dim=0)
    return first_half, second_half
        '''

# Overwrite mistral.py in the working NeMo checkout with the module source held in `write`.
with open("/kaggle/working/NeMo/nemo/collections/llm/gpt/model/mistral.py", "w") as file:
    file.write(write)

In [None]:
write = '''
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, Literal, Optional

import pytorch_lightning as L
import torch
import torch.distributed
from megatron.core.optimizer import OptimizerConfig
from megatron.core.transformer.transformer_config import TransformerConfig

from nemo.collections.llm import fn
from nemo.lightning import get_vocab_size, io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, MyOptimizerModule

if TYPE_CHECKING:
    from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel

    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec


@dataclass
class GPTConfig(TransformerConfig, io.IOMixin):
    """Transformer configuration extended with the settings needed to build an MCore GPT model."""

    # From megatron.core.models.gpt.gpt_model.GPTModel
    fp16_lm_cross_entropy: bool = False
    parallel_output: bool = True
    share_embeddings_and_output_weights: bool = True
    make_vocab_size_divisible_by: int = 128
    position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute"
    rotary_base: int = 10000
    rotary_percent: float = 1.0
    seq_len_interpolation_factor: Optional[float] = None
    seq_length: int = 1024

    # TODO: Move this to better places?
    get_attention_mask_from_fusion: bool = False

    def configure_model(self, tokenizer) -> "MCoreGPTModel":
        """Instantiate the Megatron-Core GPT model described by this config.

        Args:
            tokenizer: Object whose ``vocab_size`` is used to size the vocabulary.

        Returns:
            A new ``MCoreGPTModel`` for the current pipeline stage.

        Raises:
            AssertionError: If virtual pipeline parallelism is enabled and the number of
                layers per pipeline stage is not a multiple of the virtual pipeline size.
        """
        vp_size = self.virtual_pipeline_model_parallel_size
        if vp_size:
            layers_per_stage = self.num_layers // self.pipeline_model_parallel_size
            assert (
                layers_per_stage % vp_size == 0
            ), "Make sure the number of model chunks is the same across all pipeline stages."

        from megatron.core import parallel_state
        from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
        from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel

        model_kwargs = {
            "transformer_layer_spec": get_gpt_layer_with_transformer_engine_spec(self.num_moe_experts),
            "vocab_size": get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by),
            "max_sequence_length": self.seq_length,
            "fp16_lm_cross_entropy": self.fp16_lm_cross_entropy,
            "parallel_output": self.parallel_output,
            "share_embeddings_and_output_weights": self.share_embeddings_and_output_weights,
            "position_embedding_type": self.position_embedding_type,
            "rotary_percent": self.rotary_percent,
            "rotary_base": self.rotary_base,
            "seq_len_interpolation_factor": self.seq_len_interpolation_factor,
            # Input/output layers exist only on the first/last pipeline stage respectively.
            "pre_process": parallel_state.is_pipeline_first_stage(),
            "post_process": parallel_state.is_pipeline_last_stage(),
        }
        return MCoreGPTModel(self, **model_kwargs)


class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin):
    """Lightning module wrapping a Megatron-Core GPT model built from a ``GPTConfig``."""

    def __init__(
        self,
        config: GPTConfig,
        # TODO: Add transformer_layer_spec when we update mcore
        optim: Optional[MyOptimizerModule] = None,
        tokenizer: Optional["TokenizerSpec"] = None,
    ):
        """Store config and tokenizer and attach the optimizer module.

        Args:
            config: Model configuration used later by ``configure_model``.
            optim: Optimizer module; defaults to a ``MegatronOptimizerModule`` with
                ``lr=1e-4`` and the distributed optimizer enabled.
            tokenizer: Tokenizer passed to ``config.configure_model``.
        """
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer
        self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True))
        self.optim.connect(self)  # This will bind the `configure_optimizers` method

    def configure_model(self) -> None:
        """Build ``self.module`` once; later calls are no-ops."""
        if not hasattr(self, "module"):
            self.module = self.config.configure_model(self.tokenizer)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
        decoder_input: Optional[torch.Tensor] = None,
        inference_params=None,
        packed_seq_params=None,
    ) -> torch.Tensor:
        """Run the wrapped module.

        Args:
            input_ids: Token ids.
            position_ids: Position ids.
            attention_mask: Attention mask.
            labels: Optional labels forwarded to the module.
            decoder_input: Optional decoder input forwarded to the module.
            inference_params: Optional inference parameters forwarded to the module.
            packed_seq_params: Optional packed-sequence parameters. ``gpt_forward_step``
                supplies this when the batch contains ``cu_seqlens``; it is forwarded
                only when set, so calls without it are unchanged.

        Returns:
            Whatever the wrapped module returns.
        """
        # Only pass the keyword when it is actually set, so the unpacked call is
        # exactly what it was before this parameter existed.
        extra_kwargs = {}
        if packed_seq_params is not None:
            extra_kwargs["packed_seq_params"] = packed_seq_params

        output_tensor = self.module(
            input_ids,
            position_ids,
            attention_mask,
            decoder_input=decoder_input,
            labels=labels,
            inference_params=inference_params,
            **extra_kwargs,
        )

        return output_tensor

    def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]:
        """Delegate to ``gpt_data_step``."""
        return gpt_data_step(dataloader_iter)

    def forward_step(self, batch) -> torch.Tensor:
        """Delegate to ``gpt_forward_step``."""
        return gpt_forward_step(self, batch)

    def training_step(self, batch, batch_idx=None) -> torch.Tensor:
        """Run a forward step on a training batch."""
        # In mcore the loss-function is part of the forward-pass (when labels are provided)

        return self.forward_step(batch)

    def validation_step(self, batch, batch_idx=None) -> torch.Tensor:
        """Run a forward step on a validation batch."""
        # In mcore the loss-function is part of the forward-pass (when labels are provided)

        return self.forward_step(batch)

    def training_loss_reduction(self) -> MaskedTokenLossReduction:
        """Loss reduction used for training."""
        return MaskedTokenLossReduction()

    def validation_loss_reduction(self) -> MaskedTokenLossReduction:
        """Loss reduction used for validation."""
        return MaskedTokenLossReduction(validation_step=True)


def gpt_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
    """Fetch the next batch and move the tensors this pipeline stage needs to the GPU.

    Keys that are not needed on this stage are kept in the result with value ``None``.

    Args:
        dataloader_iter: Iterator yielding either a batch dict or a 3-tuple whose first
            element is the batch dict.

    Returns:
        The batch dict after context-parallel slicing.
    """
    from megatron.core import parallel_state

    # Based on: https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py#L87
    # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L828-L842

    batch = next(dataloader_iter)

    _batch: dict
    if isinstance(batch, tuple) and len(batch) == 3:
        _batch = batch[0]
    else:
        _batch = batch

    required_keys = {"attention_mask"}
    if parallel_state.is_pipeline_first_stage():
        required_keys.update(("tokens", "position_ids"))
    if parallel_state.is_pipeline_last_stage():
        required_keys.update(("labels", "loss_mask"))
    # Packed-sequence metadata is read by get_packed_seq_params() whenever the batch has
    # a cu_seqlens key, so it must survive the filtering below instead of becoming None.
    # NOTE(review): with context parallelism > 1 these entries are also sliced by
    # get_batch_on_this_context_parallel_rank -- confirm packed sequences + CP is unused.
    if "cu_seqlens" in _batch:
        required_keys.update(("cu_seqlens", "cu_seqlens_argmin", "max_seqlen"))
    # if self.get_attention_mask_from_fusion:
    #     required_keys.remove('attention_mask')

    # A required key may legitimately carry None; leave it as None instead of
    # failing on None.cuda().
    _batch = {
        key: val.cuda(non_blocking=True) if key in required_keys and val is not None else None
        for key, val in _batch.items()
    }
    # slice batch along sequence dimension for context parallelism
    output = get_batch_on_this_context_parallel_rank(_batch)

    return output


def gpt_forward_step(model, batch) -> torch.Tensor:
    """Call ``model`` with the keyword arguments extracted from ``batch``.

    Args:
        model: Callable accepting ``input_ids``, ``position_ids``, ``attention_mask`` and
            ``labels`` keywords (plus ``packed_seq_params`` for packed batches).
        batch: Mapping with ``tokens``, ``position_ids``, ``attention_mask`` and ``labels``.
            When it also has a ``cu_seqlens`` key, packed-sequence parameters are built
            from it and passed along.

    Returns:
        Whatever ``model`` returns.

    Raises:
        KeyError: If one of the four mandatory batch keys is missing.
    """
    # (model keyword, batch key) pairs, in the order the keywords are passed.
    keyword_sources = (
        ("input_ids", "tokens"),
        ("position_ids", "position_ids"),
        ("attention_mask", "attention_mask"),
        ("labels", "labels"),
    )
    forward_args = {keyword: batch[batch_key] for keyword, batch_key in keyword_sources}

    if 'cu_seqlens' in batch:
        forward_args['packed_seq_params'] = get_packed_seq_params(batch)

    return model(**forward_args)


def get_batch_on_this_context_parallel_rank(batch):
    """Keep only this context-parallel rank's part of every tensor in ``batch``.

    When the context-parallel world size is 1 the batch is returned untouched. Otherwise
    each non-None tensor is split into ``2 * cp_size`` chunks along its sequence dimension
    (dim 2 for ``attention_mask``, dim 1 for everything else) and the chunks at positions
    ``cp_rank`` and ``2 * cp_size - cp_rank - 1`` are kept. The loss-mask sum taken before
    slicing is stored under ``num_valid_tokens_in_ub``.

    Args:
        batch: Dict of tensors (or ``None`` values); modified in place.

    Returns:
        The same ``batch`` object.
    """
    from megatron.core import parallel_state

    cp_size = parallel_state.get_context_parallel_world_size()
    if cp_size <= 1:
        return batch

    # Count valid tokens on the full micro-batch, before it is sliced.
    num_valid_tokens_in_ub = None
    if batch.get('loss_mask') is not None:
        num_valid_tokens_in_ub = batch['loss_mask'].sum()

    cp_rank = parallel_state.get_context_parallel_rank()
    n_chunks = 2 * cp_size
    for key, val in batch.items():
        if val is None:
            continue
        seq_dim = 2 if key == 'attention_mask' else 1
        leading_dims = val.shape[0:seq_dim]
        chunked = val.view(
            *leading_dims,
            n_chunks,
            val.shape[seq_dim] // n_chunks,
            *val.shape[(seq_dim + 1) :],
        )
        # This rank keeps one chunk from the front and its mirror from the back.
        index = torch.tensor([cp_rank, (n_chunks - cp_rank - 1)], device="cpu", pin_memory=True).cuda(
            non_blocking=True
        )
        selected = chunked.index_select(seq_dim, index)
        # Merge the two kept chunks back into a single sequence dimension.
        batch[key] = selected.view(*leading_dims, -1, *selected.shape[(seq_dim + 2) :])
    batch['num_valid_tokens_in_ub'] = num_valid_tokens_in_ub
    return batch


def get_packed_seq_params(batch):
    """Build ``PackedSeqParams`` from the packed-sequence entries of ``batch``.

    Args:
        batch: Mapping with a ``cu_seqlens`` tensor and optionally ``cu_seqlens_argmin``
            and ``max_seqlen``. Either optional entry may be absent or ``None``.

    Returns:
        ``PackedSeqParams`` in ``thd`` format using the same cumulative sequence lengths
        and maximum length for queries and keys/values.
    """
    from megatron.core.packed_seq_params import PackedSeqParams

    cu_seqlens = batch['cu_seqlens'].squeeze()  # remove batch size dimension (mbs=1)
    # remove -1 "paddings" added in collate_fn
    if (cu_seqlens_argmin := batch.get('cu_seqlens_argmin', None)) is not None:
        # pre-compute cu_seqlens_argmin in dataset class for perf
        cu_seqlens = cu_seqlens[: cu_seqlens_argmin.item()]
    else:
        cu_seqlens = cu_seqlens[: torch.argmin(cu_seqlens)]

    # pre-compute max_seqlens in dataset class for perf
    # Treat a None value like a missing key, matching the cu_seqlens_argmin handling above.
    max_seqlen = batch.get('max_seqlen', None)
    if max_seqlen is not None:
        max_seqlen = max_seqlen.squeeze()

    # these args are passed eventually into TEDotProductAttention.forward()
    return PackedSeqParams(
        cu_seqlens_q=cu_seqlens,
        cu_seqlens_kv=cu_seqlens,
        max_seqlen_q=max_seqlen,
        max_seqlen_kv=max_seqlen,
        qkv_format='thd',
    )


# Names exported by a star-import of this module.
__all__ = ["GPTModel", "GPTConfig", "gpt_data_step", "gpt_forward_step"]
        '''

# Overwrite base.py in the working NeMo checkout with the module source held in `write`.
with open("/kaggle/working/NeMo/nemo/collections/llm/gpt/model/base.py", "w") as file:
    file.write(write)

In [None]:
# Patched copy of NeMo's Mixtral model module, kept as text and written over the
# checked-out file at the bottom of this cell. Do not use triple single quotes or
# backslashes inside the payload: it lives in a non-raw ''' string.
write = '''
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Optional

import torch
import torch.nn.functional as F

from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel
from nemo.lightning import io, teardown
from nemo.lightning.pytorch.optim import MyOptimizerModule

if TYPE_CHECKING:
    # Names used only inside string annotations below; imported for type checkers.
    from transformers import MixtralConfig, MixtralForCausalLM

    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec


@dataclass
class MixtralConfig8x7B(GPTConfig):
    """
    Config for Mixtral-8x7B model
    Official announcement: https://mistral.ai/news/mixtral-of-experts/
    """

    normalization: str = "RMSNorm"
    activation_func: Callable = F.silu
    position_embedding_type: str = "rope"
    add_bias_linear: bool = False
    gated_linear_unit: bool = True
    apply_query_key_layer_scaling: bool = False  # TODO: Should this be True?

    num_layers: int = 32
    hidden_size: int = 4096
    num_attention_heads: int = 32
    num_query_groups: int = 8
    ffn_hidden_size: int = 14336
    max_position_embeddings: int = 4096  # 32768
    seq_length: int = 4096  # 32768
    # MoE
    num_moe_experts: int = 8
    # NOTE(review): the HF importer below overrides this with num_experts_per_tok;
    # the released Mixtral-8x7B is described as routing to 2 experts - confirm this default.
    moe_router_topk: int = 1

    init_method_std: float = 0.02
    layernorm_epsilon: float = 1e-5
    # rotary
    rotary_percent: float = 0.5
    rotary_base: float = 10000


class MixtralModel(GPTModel):
    """GPTModel that falls back to ``MixtralConfig8x7B()`` when no config is supplied."""

    def __init__(
        self,
        config: Optional[MixtralConfig8x7B] = None,
        optim: Optional[MyOptimizerModule] = None,
        tokenizer: Optional["TokenizerSpec"] = None,
    ):
        super().__init__(config or MixtralConfig8x7B(), optim=optim, tokenizer=tokenizer)


@io.model_importer(MixtralModel, ext="hf")
class HFMixtralImporter(io.ModelConnector["MixtralForCausalLM", MixtralModel]):
    """Importer registered for ``ext="hf"`` that turns a Hugging Face Mixtral checkpoint into a MixtralModel.

    ``str(self)`` is what gets passed to the Hugging Face ``from_pretrained`` calls.
    """

    def init(self) -> MixtralModel:
        """Build an empty target model from the HF-derived config and tokenizer."""
        return MixtralModel(self.config, tokenizer=self.tokenizer)

    def apply(self, output_path: Path) -> Path:
        """Load the HF model, copy its weights into a new MixtralModel and save it.

        Args:
            output_path: destination handed to ``nemo_save``.

        Returns:
            ``output_path``, unchanged.
        """
        from transformers import MixtralForCausalLM

        source = MixtralForCausalLM.from_pretrained(str(self))
        target = self.init()
        trainer = self.nemo_setup(target)
        self.convert_state(source, target)
        self.nemo_save(output_path, trainer)

        # Release the trainer/model once the checkpoint has been written.
        teardown(trainer, target)
        del trainer, target

        return output_path

    def convert_state(self, source, target):
        """Copy weights from ``source`` to ``target``.

        Keys listed in ``mapping`` are renamed one-to-one; the fused QKV and the
        fused expert fc1 weights are produced by the two transforms instead.
        """
        mapping = {
            "model.embed_tokens.weight": "embedding.word_embeddings.weight",
            "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight",
            "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
            "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.pre_mlp_layernorm.weight",
            # MoE
            "model.layers.*.block_sparse_moe.experts.*.w2.weight": "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight",
            "model.layers.*.block_sparse_moe.gate.weight": "decoder.layers.*.mlp.router.weight",
            # lm-head
            "model.norm.weight": "decoder.final_layernorm.weight",
            "lm_head.weight": "output_layer.weight",
        }

        return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_moe_w1_w3])

    @property
    def tokenizer(self) -> "AutoTokenizer":
        """NeMo ``AutoTokenizer`` wrapper built from ``str(self)``."""
        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

        return AutoTokenizer(str(self))

    @property
    def config(self) -> MixtralConfig8x7B:
        """``MixtralConfig8x7B`` populated from the Hugging Face ``MixtralConfig`` of the checkpoint."""
        from transformers import MixtralConfig as HfMixtralConfig

        config = HfMixtralConfig.from_pretrained(str(self))
        return MixtralConfig8x7B(
            activation_func=F.silu,
            # network
            num_layers=config.num_hidden_layers,
            hidden_size=config.hidden_size,
            ffn_hidden_size=config.intermediate_size,
            max_position_embeddings=config.max_position_embeddings,  # TODO
            seq_length=config.max_position_embeddings,
            # RoPE
            position_embedding_type='rope',
            rotary_base=config.rope_theta,
            # Transformer config
            num_attention_heads=config.num_attention_heads,
            num_query_groups=config.num_key_value_heads,
            num_moe_experts=config.num_local_experts,
            moe_router_topk=config.num_experts_per_tok,
            # norm
            normalization='RMSNorm',
            layernorm_epsilon=config.rms_norm_eps,
            # Init
            init_method_std=config.initializer_range,
            gated_linear_unit=True,
            # Vocab
            make_vocab_size_divisible_by=128,
        )


@io.state_transform(
    source_key=(
        "model.layers.*.self_attn.q_proj.weight",
        "model.layers.*.self_attn.k_proj.weight",
        "model.layers.*.self_attn.v_proj.weight",
    ),
    target_key="decoder.layers.*.self_attention.linear_qkv.weight",
)
def _import_qkv(ctx: io.TransformCTX, q, k, v):
    """Fuse separate q/k/v projection weights into one ``linear_qkv`` weight.

    The weights are split per head, then laid out group by group: for each of
    the ``num_query_groups`` groups, its ``heads_per_group`` query heads come
    first, followed by that group's single key head and single value head.
    The result is flattened to
    ``[head_size * (head_num + 2 * num_query_groups), hidden_size]``.
    """
    megatron_config = ctx.target.config

    head_num = megatron_config.num_attention_heads
    num_query_groups = megatron_config.num_query_groups
    heads_per_group = head_num // num_query_groups
    hidden_size = megatron_config.hidden_size
    head_size = hidden_size // head_num

    old_tensor_shape = q.size()
    new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
    new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]

    # Expose the head dimension so heads can be sliced out per query group.
    q = q.view(*new_q_tensor_shape)
    k = k.view(*new_kv_tensor_shape)
    v = v.view(*new_kv_tensor_shape)

    qkv_weights_l = []
    for i in range(num_query_groups):
        qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :])
        qkv_weights_l.append(k[i : i + 1, :, :])
        qkv_weights_l.append(v[i : i + 1, :, :])
    qkv_weights = torch.cat(qkv_weights_l)
    assert qkv_weights.ndim == 3, qkv_weights.shape
    assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape
    assert qkv_weights.shape[1] == head_size, qkv_weights.shape
    assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape

    qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size])

    return qkv_weights


@io.state_transform(
    source_key=(
        "model.layers.*.block_sparse_moe.experts.*.w1.weight",
        "model.layers.*.block_sparse_moe.experts.*.w3.weight",
    ),
    target_key="decoder.layers.*.mlp.experts.local_experts.*.linear_fc1.weight",
)
def _import_moe_w1_w3(gate_proj, up_proj):
    """Stack an expert's w1 and w3 weights along dim 0 to form its ``linear_fc1`` weight."""
    return torch.cat((gate_proj, up_proj), axis=0)
        '''

# Overwrite the Mixtral module in the working NeMo checkout with the payload above.
with open("/kaggle/working/NeMo/nemo/collections/llm/gpt/model/mixtral.py", "w") as file:
    file.write(write)

In [None]:
# Replacement source for nemo/lightning/pytorch/optim/__init__.py. It re-exports
# LRSchedulerModule, MyOptimizerModule, MegatronOptimizerModule and the LR
# scheduler classes listed in __all__.
# NOTE(review): `MyOptimizerModule` must exist in optim/base.py for this import to
# succeed - presumably another cell patches that file; confirm.
# The indented closing delimiter means the written file ends with a whitespace-only line.
write = '''from nemo.lightning.pytorch.optim.base import LRSchedulerModule, MyOptimizerModule
from nemo.lightning.pytorch.optim.lr_scheduler import (
    CosineAnnealingScheduler,
    InverseSquareRootAnnealingScheduler,
    NoamAnnealingScheduler,
    NoamHoldAnnealingScheduler,
    PolynomialDecayAnnealingScheduler,
    PolynomialHoldDecayAnnealingScheduler,
    SquareAnnealingScheduler,
    SquareRootAnnealingScheduler,
    T5InverseSquareRootAnnealingScheduler,
    WarmupAnnealingScheduler,
    WarmupHoldPolicyScheduler,
    WarmupPolicyScheduler,
)
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule

__all__ = [
    "MyOptimizerModule",
    "LRSchedulerModule",
    "MegatronOptimizerModule",
    "WarmupPolicyScheduler",
    "WarmupHoldPolicyScheduler",
    "SquareAnnealingScheduler",
    "SquareRootAnnealingScheduler",
    "NoamAnnealingScheduler",
    "NoamHoldAnnealingScheduler",
    "WarmupAnnealingScheduler",
    "InverseSquareRootAnnealingScheduler",
    "T5InverseSquareRootAnnealingScheduler",
    "PolynomialDecayAnnealingScheduler",
    "PolynomialHoldDecayAnnealingScheduler",
    "CosineAnnealingScheduler",
]
        '''

# Overwrite the package __init__ in the working NeMo checkout with the text above.
with open("/kaggle/working/NeMo/nemo/lightning/pytorch/optim/__init__.py", "w") as file:
    file.write(write)

## Review CheckPoint 🕵️‍♂️✅
### Reviewing a saved checkpoint of the model to continue training from a previous state.

In [None]:
# List the checkpoint files stored with the input dataset.
# NOTE(review): this inspects the /kaggle/input copy, whereas the training cells
# below write to results/ relative to the working directory - confirm this is the
# directory you intend to look at.
!ls "/kaggle/input/sussy-baka/results/Some name of our experiment/checkpoints"

In [None]:
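# Optional, disabled by default: uncomment to delete an unwanted checkpoint from the working copy before resuming.
#!rm -r "/kaggle/working/results/Some name of our experiment/checkpoints/Some name of our experiment--val_wer=0.5307-epoch=42.ckpt"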

In [None]:
# Optional, disabled by default: uncomment to rename a "-last" checkpoint in the working copy to its plain name.
#!mv "/kaggle/working/results/Some name of our experiment/checkpoints/Some name of our experiment--val_wer=0.4992-epoch=40-last.ckpt" "/kaggle/working/results/Some name of our experiment/checkpoints/Some name of our experiment--val_wer=0.4992-epoch=40.ckpt"

# Training Model 🚂💻
### Training the model on the given dataset using the defined configurations and hyperparameters.

In [None]:
TOKENIZER='/kaggle/working/tokinzers/sus/tokenizer_spe_unigram_v128'
TRAIN_MANIFEST='/kaggle/input/aic-manifests/train_manifest.json'
VAL_MANIFEST='/kaggle/input/aic-manifests/test_manifest.json'
NEMO_ROOT='/kaggle/working/NeMo'

! HYDRA_FULL_ERROR=1 python /kaggle/working/NeMo/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
  --config-path=../conf/conformer/ \
  --config-name=conformer_ctc_bpe \
  exp_manager.name="Some name of our experiment" \
  exp_manager.resume_if_exists=true \
  exp_manager.resume_ignore_no_checkpoint=true \
  exp_manager.exp_dir=results/ \
  model.tokenizer.dir=$TOKENIZER \
  model.train_ds.manifest_filepath=$TRAIN_MANIFEST \
  model.validation_ds.manifest_filepath=$VAL_MANIFEST \
  exp_manager.create_wandb_logger=True \
  exp_manager.wandb_logger_kwargs.project="'ASR'"

# Fine Tuning on Adapt dataset 🛠️📈
### Performing fine-tuning on a specialized dataset (Adapt dataset) to further refine the model’s accuracy and performance.

In [None]:
TOKENIZER='/kaggle/working/tokinzers/sus/tokenizer_spe_unigram_v128'
TRAIN_MANIFEST='/kaggle/input/adapt-split-manifest/data_85.json'
VAL_MANIFEST='/kaggle/input/adapt-split-manifest/data_15.json'
NEMO_ROOT='/kaggle/working/NeMo'

! HYDRA_FULL_ERROR=1 python /kaggle/working/NeMo/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
  --config-path=../conf/conformer/ \
  --config-name=conformer_ctc_bpe \
  exp_manager.name="Some name of our experiment" \
  exp_manager.resume_if_exists=true \
  exp_manager.resume_ignore_no_checkpoint=true \
  exp_manager.exp_dir=results/ \
  model.tokenizer.dir=$TOKENIZER \
  model.train_ds.manifest_filepath=$TRAIN_MANIFEST \
  model.validation_ds.manifest_filepath=$VAL_MANIFEST \
  exp_manager.create_wandb_logger=True \
  exp_manager.wandb_logger_kwargs.project="'ASR'"

# Generating CSV file 📄➡️📊
### Creating a CSV file with the results, predictions, or any other relevant data produced by the model.

In [None]:
# Location of the exported .nemo archive inside the experiment's checkpoints folder.
checkpoint_path = (
    '/kaggle/working/results/Some name of our experiment/checkpoints/'
    'Some name of our experiment.nemo'
)

In [None]:
# Rebuild the trained CTC-BPE model from the .nemo archive chosen above.
first_asr_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(
    restore_path=checkpoint_path,
)

In [None]:
import os
import csv

# Requires `first_asr_model` from the previous cell (the restored CTC-BPE model).

# Directory containing the .wav files to transcribe.
data_dir = '/kaggle/input/test-data/test'

# os.listdir() returns entries in arbitrary order; sort so the CSV row order is
# reproducible from run to run.
wav_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.wav'))

# Full paths, in the same order as wav_files.
audio_paths = [os.path.join(data_dir, wav) for wav in wav_files]

# Transcribe in fixed-size batches; the final batch may be smaller.
batch_size = 4
transcriptions = []

for i in range(0, len(audio_paths), batch_size):
    batch_paths = audio_paths[i:i + batch_size]
    transcripts = first_asr_model.transcribe(audio=batch_paths, batch_size=len(batch_paths))
    transcriptions.extend(transcripts)

# zip() below would silently drop rows on a length mismatch, so fail loudly instead.
if len(transcriptions) != len(wav_files):
    raise RuntimeError(
        f"Expected {len(wav_files)} transcripts but got {len(transcriptions)}"
    )

# One row per clip: file name without extension, then its transcript.
csv_data = [
    [os.path.splitext(wav)[0], transcript]
    for wav, transcript in zip(wav_files, transcriptions)
]

# Write with an explicit UTF-8 encoding so non-ASCII transcripts do not depend on
# the platform's default locale.
csv_file = 'transcriptions.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['audio', 'transcript'])
    writer.writerows(csv_data)

print(f"Transcriptions saved to {csv_file}")