# Version Checks (System Dependent)

Installation Scripts Prior to Running:

```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
sudo apt-get update && sudo apt-get install -y libsndfile1 ffmpeg
pip install Cython packaging
pip install nemo_toolkit['asr']
pip install datasets
```

## Numba can detect CUDA

In [1]:
from numba import cuda
print(cuda.is_available())
# Requires numpy 1.24 or greater

True


## Numpy version oops

In [2]:
import numpy
print(numpy.__version__)

1.26.4


## Torch can detect CUDA

In [3]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.7.0+cu126
True


## Proper Nemo installations

In [4]:
import nemo
print(nemo.__version__)

2.3.1


In [5]:
import nemo.collections.asr as nemo_asr

In [6]:
!export CUDA_VISIBLE_DEVICES=0

# Starts

In [8]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("ThePyProgrammer/asr")

README.md:   0%|          | 0.00/326 [00:00<?, ?B/s]

train-00000-of-00008.parquet:   0%|          | 0.00/437M [00:00<?, ?B/s]

train-00001-of-00008.parquet:   0%|          | 0.00/441M [00:00<?, ?B/s]

train-00002-of-00008.parquet:   0%|          | 0.00/448M [00:00<?, ?B/s]

train-00003-of-00008.parquet:   0%|          | 0.00/434M [00:00<?, ?B/s]

train-00004-of-00008.parquet:   0%|          | 0.00/435M [00:00<?, ?B/s]

train-00005-of-00008.parquet:   0%|          | 0.00/434M [00:00<?, ?B/s]

train-00006-of-00008.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

train-00007-of-00008.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [20]:
import torch
import os
import json
import librosa
import glob
import subprocess
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf, DictConfig
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager

import lightning.pytorch as pl
# Correct import
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger

print("PyTorch CUDA available:", torch.cuda.is_available(), "CUDA version:", torch.version.cuda)

# Download and prepare AN4 data (same as before)
DATA_DIR = os.getcwd() + "/files/"
os.environ["DATA_DIR"] = DATA_DIR


PyTorch CUDA available: True CUDA version: 12.6


In [10]:
# if not os.path.exists(f"{DATA_DIR}/an4_sphere.tar.gz"):
#     !wget https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz

# # Extract and convert data
# if not os.path.exists(f"{DATA_DIR}/an4"):
#     !tar -xvf an4_sphere.tar.gz
    
#     !mv an4 $DATA_DIR

In [11]:
from huggingface_hub import snapshot_download

directory = snapshot_download(repo_id="ThePyProgrammer/asr", repo_type="dataset")

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/2.46k [00:00<?, ?B/s]

In [19]:
import pyarrow.parquet as pq
j = 0
transcript = []
for i in range(8):
    df = pq.read_table(f'{directory}/data/train-0000{i}-of-00008.parquet').to_pandas()
    transcript += df['transcript'].to_list()
    for _, k in df['audio'].items():
        res = k['bytes']
        with open(f'files/{j}.wav', mode='bx') as f:
            f.write(res)
        j += 1

In [None]:
from tqdm import tqdm
def an4_build_manifest(trans_array, manifest_path, target_wavs_dir):
    """Build an AN4 manifest from a given transcript file."""
    maxi, mini = -1, 1000
    with open(manifest_path, 'w') as fout:
        for idx, line in tqdm(enumerate(trans_array)):
            trans = line
            audio_path = os.path.join(target_wavs_dir, str(idx) + '.wav')
            duration = librosa.core.get_duration(filename=audio_path)
            if maxi < duration: 
                maxi = duration
            if mini > duration:
                mini = duration
            metadata = {"audio_filepath": audio_path, "duration": duration, "text": trans}
            json.dump(metadata, fout)
            fout.write('\n')
    return maxi, mini

# Convert data and build manifests
source_data_dir = f"{DATA_DIR}"
target_data_dir = f"{DATA_DIR}"


    
# Build AN4 manifests
train_manifest = os.path.join(DATA_DIR, 'train_manifest.json')
maxi, mini = an4_build_manifest(transcript, train_manifest, DATA_DIR)

test_manifest = os.path.join(DATA_DIR, 'test_manifest.json')
an4_build_manifest(transcript[:8], test_manifest, DATA_DIR)


2533it [04:09, 10.06it/s]

In [14]:
import argparse
import os
from huggingface_hub import snapshot_download


def download_model(model_name, path, revision="main", local_dir_name=None):
    """
    Download a model from HuggingFace Hub.

    Args:
        model_name (str): Name of the model to download (e.g., facebook/wav2vec2-base-960h)
        path (str): Path where the model will be downloaded
        revision (str): Specific model revision to download
        local_dir_name (str, optional): Custom directory name for the downloaded model

    Returns:
        Path to the downloaded model
    """
    # Create the directory if it doesn't exist
    os.makedirs(path, exist_ok=True)

    # Set the local directory name
    if local_dir_name is None:
        # Use the last part of the model name as directory name
        # e.g., facebook/wav2vec2-base-960h -> wav2vec2-base-960h
        local_dir_name = model_name.split('/')[-1]

    local_dir = os.path.join(path, local_dir_name)

    print(f"Downloading model '{model_name}' (revision: {revision})...")
    print(f"Target directory: {local_dir}")

    try:
        # Download the model
        model_path = snapshot_download(
            repo_id=model_name,
            revision=revision,
            local_dir=local_dir,
            ignore_patterns=["*.msgpack", "*.safetensors", "*.h5", "*.ot", "*.tflite"]
        )
        print(f"Successfully downloaded model to {model_path}")
        return model_path
    except Exception as e:
        print(f"Error downloading model: {e}")
        return None



In [15]:
download_model('nvidia/parakeet-tdt-0.6b-v2', './')

Downloading model 'nvidia/parakeet-tdt-0.6b-v2' (revision: main)...
Target directory: ./parakeet-tdt-0.6b-v2


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/20.6k [00:00<?, ?B/s]

parakeet-tdt-0.6b-v2.nemo:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Successfully downloaded model to /home/raid/cognition/til/asr/notebooks/parakeet-tdt-0.6b-v2


'/home/raid/cognition/til/asr/notebooks/parakeet-tdt-0.6b-v2'

In [59]:
# Load the pre-trained model .from_pretrained
# model = nemo_asr.models.EncDecRNNTBPEModel.restore_from("parakeet-tdt-0.6b-v2.nemo")
# model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("parakeet-tdt-0.6b-v2.nemo")

# model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")
model = nemo_asr.models.ASRModel.restore_from("./parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo")

# Create training configuration
train_config = DictConfig({
    'manifest_filepath': f'{DATA_DIR}/train_manifest.json',
    'sample_rate': 4500 * 5,
    'batch_size': 2,  # Reduced batch size for stability
    'shuffle': True,
    'num_workers': 2,  # Reduced for stability
    'pin_memory': True,
    'trim_silence': True,
    'max_duration': maxi,
    'min_duration': mini,
    "trim": True,
})

val_config = DictConfig({
    'manifest_filepath': f'{DATA_DIR}/test_manifest.json',
    'sample_rate': 16000,
    'batch_size': 2,
    'shuffle': False,
    'num_workers': 2,
    'pin_memory': True,
     "trim": True,
})


[NeMo I 2025-06-03 13:39:35 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2025-06-03 13:39:36 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    use_lhotse: true
    skip_missing_manifest_entries: true
    input_cfg: null
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    num_workers: 2
    pin_memory: true
    max_duration: 40.0
    min_duration: 0.1
    text_field: answer
    batch_duration: null
    use_bucketing: true
    bucket_duration_bins: null
    bucket_batch_size: null
    num_buckets: 30
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2025-06-03 13:39:36 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config :

[NeMo I 2025-06-03 13:39:36 nemo_logging:393] PADDING: 0
[NeMo I 2025-06-03 13:39:41 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-06-03 13:39:41 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-06-03 13:39:41 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-06-03 13:39:52 nemo_logging:393] Model EncDecRNNTBPEModel was successfully restored from /kaggle/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo.


### Checks

In [60]:
print(type(model))
from lightning.pytorch import LightningModule
print(isinstance(model, LightningModule))

<class 'nemo.collections.asr.models.rnnt_bpe_models.EncDecRNNTBPEModel'>
True


### Training

In [61]:
# Set up training and validation data
model.setup_training_data(train_config)
model.setup_validation_data(val_config)

# Optional but recommended: prepare the model
# model.prepare_for_training()

# Set up logger
tb_logger = TensorBoardLogger(save_dir="../tb_logs", name="Parakeet_ATC_finetune")

# Checkpoint and early stopping on val_wer
checkpoint_callback = ModelCheckpoint(
    monitor="val_wer", mode="min", save_top_k=1,
    dirpath="../checkpoints", filename="best_val_wer"
)
early_stop_callback = EarlyStopping(
    monitor="val_wer", mode="min", patience=5
)

# Initialize Trainer
trainer = Trainer(
    max_epochs=30,
    accelerator="gpu", devices=1,
    logger=tb_logger,
    callbacks=[checkpoint_callback, early_stop_callback],
)

# Train the model
trainer.fit(model)


[NeMo I 2025-06-03 13:40:35 nemo_logging:393] Dataset loaded with 4500 files totalling 31.91 hours
[NeMo I 2025-06-03 13:40:35 nemo_logging:393] 0 files were filtered totalling 0.00 hours
[NeMo I 2025-06-03 13:40:35 nemo_logging:393] Dataset loaded with 8 files totalling 0.06 hours
[NeMo I 2025-06-03 13:40:35 nemo_logging:393] 0 files were filtered totalling 0.00 hours


2025-06-03 13:40:38.093414: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748958038.273797      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748958038.335094      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2025-06-03 13:40:49 nemo_logging:393] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.0001
        maximize: False
        weight_decay: 0.001
    )
[NeMo I 2025-06-03 13:40:49 nemo_logging:393] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7fd8e97a8910>" 
    will be used during training (effective maximum steps = 16890) - 
    Parameters : 
    (warmup_steps: 0
    warmup_ratio: null
    min_lr: 1.0e-06
    max_steps: 16890
    )


INFO: 
  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | train
1 | encoder           | ConformerEncoder                  | 608 M  | train
2 | decoder           | RNNTDecoder                       | 7.2 M  | train
3 | joint             | RNNTJoint                         | 1.7 M  | train
4 | loss              | RNNTLoss                          | 0      | train
5 | spec_augmentation | SpectrogramAugmentation           | 0      | train
6 | wer               | WER                               | 0      | train
--------------------------------------------------------------------------------
617 M     Trainable params
0         Non-trainable params
617 M     Total params
2,471.304 Total estimated model params size (MB)
706       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-06-03 13:40:50 nemo_logging:393] Enabled CUDA graphs for module <class 'nemo.collections.asr.models.rnnt_bpe_models.EncDecRNNTBPEModel'>.decoding.decoding
[NeMo I 2025-06-03 13:40:50 nemo_logging:393] Enabled CUDA graphs for module <class 'nemo.collections.asr.metrics.wer.WER'>joint._wer.decoding.decoding
[NeMo I 2025-06-03 13:40:54 nemo_logging:393] 
    
[NeMo I 2025-06-03 13:40:54 nemo_logging:393] reference:Operation Echelon has yielded significant progress in our pursuit of the rogue AI droid BH-2000. Our surveillance drones have identified its current location in sector 7G, and our ground units are mobilizing for a targeted strike. Updates on BH-2000's movements indicate it is attempting to upload critical data, posing a potential threat to our operations. We recommend immediate action to neutralize the target before it can transfer any sensitive information.
[NeMo I 2025-06-03 13:40:54 nemo_logging:393] predicted:Operation Echelon has yielded significant progress in

Training: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-06-03 13:40:55 nemo_logging:393] Disabled CUDA graphs for module <class 'nemo.collections.asr.models.rnnt_bpe_models.EncDecRNNTBPEModel'>.decoding.decoding
[NeMo I 2025-06-03 13:40:55 nemo_logging:393] Disabled CUDA graphs for module <class 'nemo.collections.asr.metrics.wer.WER'>joint._wer.decoding.decoding


OutOfMemoryError: CUDA out of memory. Tried to allocate 228.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 137.12 MiB is free. Process 3638 has 15.75 GiB memory in use. Of the allocated memory 15.08 GiB is allocated by PyTorch, and 364.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
model.save_to('model.nemo')