In [1]:
# Assuming the manifest has already been created, this script transcribes the audio using both models and stores the transcripts in 2 separate text files.
# It can transcribe any number of .wav files and write the resulting text to a text file.

#_______________________________________________________________________________________________


#Helper_functions.



def softmax(logits):
    """Return the softmax of `logits` taken over the last axis.

    Args:
        logits: array-like of any rank >= 1; the last axis holds the scores
            that are normalised together.

    Returns:
        numpy.ndarray with the same shape as `logits`; every slice along the
        last axis is non-negative and sums to 1.
    """
    logits = np.asarray(logits)
    # Shift by the maximum of *each* slice, not the global maximum: with a
    # global shift, a slice far below the global max underflows to all zeros
    # and the division below yields NaN.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    e = np.exp(shifted)
    # keepdims makes the normaliser broadcast for inputs of any rank.
    return e / e.sum(axis=-1, keepdims=True)

def get_nemo_dataset(config, vocab, sample_rate=16000):
    """Build a DataLoader over the manifest stored in `config['temp_dir']`.

    Args:
        config: mapping that must provide 'temp_dir' (directory containing
            'manifest.json'), 'batch_size' and 'paths2audio_files'.
        vocab: label set handed to the dataset as `labels`.
        sample_rate: sample rate handed to the dataset (default 16000).

    Returns:
        torch.utils.data.DataLoader wrapping an AudioToCharDataset, iterating
        in manifest order.
    """
    augmentor = None

    # NOTE: `config` is rebound here to a fresh dict, so every
    # `config.get(...)` below that names a key absent from this dict
    # evaluates to its default.
    config = {
        'manifest_filepath': os.path.join(config['temp_dir'], 'manifest.json'),
        'sample_rate': sample_rate,
        'labels': vocab,
        # Never ask for a batch larger than the number of files supplied.
        'batch_size': min(config['batch_size'], len(config['paths2audio_files'])),
        'trim_silence': True,
        'shuffle': False,
    }

    dataset = AudioToCharDataset(
        manifest_filepath=config['manifest_filepath'],
        labels=config['labels'],
        sample_rate=config['sample_rate'],
        int_values=config.get('int_values', False),
        augmentor=augmentor,
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        blank_index=config.get('blank_index', -1),
        unk_index=config.get('unk_index', -1),
        normalize=config.get('normalize_transcripts', False),
        trim=config.get('trim_silence', True),
        #load_audio=config.get('load_audio', True),
        parser=config.get('parser', 'en'),
        #add_misc=config.get('add_misc', False),
    )

    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=config['batch_size'],
        collate_fn=dataset.collate_fn,
        drop_last=config.get('drop_last', False),
        # Honour the 'shuffle': False declared above. The previous hard-coded
        # shuffle=True randomised the utterance order, so outputs could not
        # be matched back to the manifest entries.
        shuffle=config['shuffle'],
        num_workers=config.get('num_workers', 0),
        pin_memory=config.get('pin_memory', False),
    )

def get_letters(probs, labels=None):
    """Greedy per-frame decoding: pick the best label for every row of `probs`.

    Args:
        probs: array indexed by frame along axis 0; each `probs[idx]` holds
            the scores of one frame.
        labels: sequence mapping a class index to its symbol. Defaults to the
            module-level `LABELS` table.

    Returns:
        List of `[symbol, frame_index]` pairs, one per frame whose best label
        is not the literal string "blank".
    """
    if labels is None:
        # Resolved at call time so the function can still be defined before
        # LABELS exists in the notebook namespace.
        labels = LABELS
    letters = []
    for idx in range(probs.shape[0]):
        current_char_idx = np.argmax(probs[idx])
        # The same table is used for the blank test and for the emitted
        # symbol; the original appended from `LabelsType`, a different name
        # from the one it tested.
        symbol = labels[current_char_idx]
        if symbol != "blank":
            letters.append([symbol, idx])
    return letters

#_______________________________________________________________________________________________



In [2]:
# Directory that holds every recording listed below. Change this one constant
# when the data moves instead of editing 100 literals.
WAV_DIR = "/mnt/sangraha/venkat_shadeslayer/asr/indictts/hindi/wavs"

# File names of the recordings to transcribe, in processing order.
_WAV_NAMES = [
    "train_hindifullfemale_02931.wav",
    "train_hindifullmale_02773.wav",
    "train_hindifullfemale_03265.wav",
    "train_hindifullmale_01628.wav",
    "train_hindifullfemale_00706.wav",
    "train_hindifullmale_02206.wav",
    "train_hindifullfemale_04370.wav",
    "train_hindifullfemale_00756.wav",
    "train_hindifullfemale_02223.wav",
    "train_hindifullfemale_03338.wav",
    "train_hindifullfemale_00660.wav",
    "train_hindifullfemale_03532.wav",
    "train_hindifullfemale_02136.wav",
    "train_hindifullmale_01817.wav",
    "train_hindifullfemale_04072.wav",
    "train_hindifullmale_04582.wav",
    "train_hindifullmale_03324.wav",
    "train_hindifullmale_02415.wav",
    "train_hindifullfemale_01208.wav",
    "train_hindifullmale_01582.wav",
    "train_hindifullmale_04280.wav",
    "train_hindifullmale_03545.wav",
    "train_hindifullfemale_03623.wav",
    "train_hindifullfemale_02898.wav",
    "train_hindifullfemale_04468.wav",
    "train_hindifullmale_03256.wav",
    "train_hindifullmale_00539.wav",
    "train_hindifullmale_01788.wav",
    "train_hindifullmale_03297.wav",
    "train_hindifullfemale_02733.wav",
    "train_hindifullfemale_02608.wav",
    "train_hindifullfemale_02724.wav",
    "train_hindifullmale_00650.wav",
    "train_hindifullmale_04277.wav",
    "train_hindifullmale_02851.wav",
    "train_hindifullfemale_02293.wav",
    "train_hindifullmale_01638.wav",
    "train_hindifullmale_02439.wav",
    "train_hindifullmale_01959.wav",
    "train_hindifullfemale_00798.wav",
    "train_hindifullfemale_02497.wav",
    "train_hindifullfemale_03115.wav",
    "train_hindifullfemale_00100.wav",
    "train_hindifullfemale_00152.wav",
    "train_hindifullfemale_00803.wav",
    "train_hindifullmale_03123.wav",
    "train_hindifullfemale_01872.wav",
    "train_hindifullmale_02688.wav",
    "train_hindifullmale_01294.wav",
    "train_hindifullfemale_04010.wav",
    "train_hindifullfemale_00715.wav",
    "train_hindifullfemale_03469.wav",
    "train_hindifullmale_03719.wav",
    "train_hindifullmale_03168.wav",
    "train_hindifullfemale_01702.wav",
    "train_hindifullmale_02942.wav",
    "train_hindifullfemale_00757.wav",
    "train_hindifullmale_01185.wav",
    "train_hindifullmale_00808.wav",
    "train_hindifullmale_02278.wav",
    "train_hindifullmale_03259.wav",
    "train_hindifullfemale_01654.wav",
    "train_hindifullmale_02960.wav",
    "train_hindifullmale_01259.wav",
    "train_hindifullfemale_03220.wav",
    "train_hindifullfemale_00538.wav",
    "train_hindifullfemale_02177.wav",
    "train_hindifullfemale_00424.wav",
    "train_hindifullmale_00272.wav",
    "train_hindifullmale_03216.wav",
    "train_hindifullmale_01501.wav",
    "train_hindifullmale_03243.wav",
    "train_hindifullfemale_02047.wav",
    "train_hindifullmale_01543.wav",
    "train_hindifullmale_02345.wav",
    "train_hindifullfemale_02164.wav",
    "train_hindifullfemale_02746.wav",
    "train_hindifullmale_00384.wav",
    "train_hindifullmale_01212.wav",
    "train_hindifullfemale_02894.wav",
    "train_hindifullfemale_02009.wav",
    "train_hindifullfemale_01642.wav",
    "train_hindifullfemale_01034.wav",
    "train_hindifullmale_01720.wav",
    "train_hindifullmale_01839.wav",
    "train_hindifullmale_00600.wav",
    "train_hindifullfemale_00545.wav",
    "train_hindifullmale_01919.wav",
    "train_hindifullmale_02062.wav",
    "train_hindifullmale_00674.wav",
    "train_hindifullmale_04365.wav",
    "train_hindifullfemale_02600.wav",
    "train_hindifullmale_04583.wav",
    "train_hindifullfemale_04374.wav",
    "train_hindifullfemale_01957.wav",
    "train_hindifullmale_00814.wav",
    "train_hindifullmale_03580.wav",
    "train_hindifullfemale_04354.wav",
    "train_hindifullfemale_03561.wav",
    "train_hindifullmale_02282.wav",
]

# Absolute paths, identical to the previously hand-written list.
audio_files_set = [WAV_DIR + "/" + wav_name for wav_name in _WAV_NAMES]



In [3]:
# Imports and model loading do not depend on the audio file, so they are done
# once here instead of being repeated on every loop iteration.
import onnx
import torch
import onnxruntime as ort
import numpy as np
import librosa

ONNX_MODEL_PATH = "/mnt/sangraha/venkat_shadeslayer/asr/momodels/mymodel3.onnx"

# Validate the exported graph, then open a single inference session on it.
onnx_model = onnx.load(ONNX_MODEL_PATH)
onnx.checker.check_model(onnx_model)
ort_sess = ort.InferenceSession(ONNX_MODEL_PATH)

# NOTE(review): every iteration overwrites `audio_signal`/`sr`, so once the
# loop ends only the LAST file of `audio_files_set` is loaded, and `i` refers
# to that file. The cells below sit outside this loop and therefore process
# that one file only.
for i in audio_files_set:
    # Both aliases are kept because other cells may refer to either name.
    audio_file = audio_filepath = i
    # Load the recording at a 16 kHz sample rate.
    audio_signal, sr = librosa.load(audio_filepath, sr=16000)

In [5]:
# audio_signal = np.random.random((1,80,length_data))
# audio_signal = np.array(audio_signal,dtype=np.float32)

# 80-band mel spectrogram of the loaded waveform.
S = librosa.feature.melspectrogram(y=audio_signal, sr=sr, n_mels=80)
print(S.shape)
# Add a leading axis of size 1 in front of the (80, frames) matrix.
S_new = np.reshape(S, (1, 80, -1))
print(S_new.shape)

# `length_data` is the time dimension of the tensor fed to the model (see the
# commented-out random input above, shaped (1, 80, length_data)). It used to
# stay hard-coded at 16000 although the real input has far fewer frames, so
# it is now derived from the actual feature tensor.
# NOTE(review): assumes the exported model's 'length' input counts feature
# frames - confirm against the export configuration.
length_data = S_new.shape[-1]

len(audio_signal)


(80, 94)
(1, 80, 94)


47916

In [7]:
# Assemble the named inputs of the ONNX session: the feature tensor and its
# length as a one-element int64 vector.
model_inputs = {
    'audio_signal': S_new,
    'length': np.array([length_data], dtype=np.int64),
}
outputs = ort_sess.run(None, model_inputs)
# Display the first output with its leading axis removed.
outputs[0].squeeze(0)

array([[-3.9329590e+01, -1.4918658e+01, -1.6156431e+01, ...,
        -2.7732710e+01, -3.5699852e+01, -6.9615802e-05],
       [-3.7635418e+01, -1.3395227e+01, -1.5437682e+01, ...,
        -2.7343876e+01, -3.5965874e+01, -2.1219028e-05],
       [-3.6312103e+01, -1.2568659e+01, -1.5509059e+01, ...,
        -2.5955353e+01, -3.5992733e+01, -3.4331686e-05],
       ...,
       [-2.5536222e+01, -6.0488372e+00, -8.7984028e+00, ...,
        -2.0566444e+01, -2.8296503e+01, -6.5437560e-03],
       [-2.8570650e+01, -8.4600286e+00, -1.1000808e+01, ...,
        -2.4977379e+01, -3.4118538e+01, -6.0134922e-04],
       [-2.6104115e+01, -9.4064903e+00, -1.2224298e+01, ...,
        -3.0500118e+01, -3.7189007e+01, -4.2679737e-04]], dtype=float32)

In [9]:
#Helper functions
import numpy as np

def softmax(x):
    """Normalise `x` along its last axis with a numerically stable softmax.

    Accepts anything NumPy can treat as an array and returns an array whose
    slices along the last axis sum to 1.
    """
    # Subtracting the per-slice maximum keeps the exponentials from overflowing.
    peak = np.amax(x, axis=-1, keepdims=True)
    weights = np.exp(np.subtract(x, peak))
    normaliser = np.sum(weights, axis=-1, keepdims=True)
    return weights / normaliser

In [10]:
    # Keep the raw session outputs under their own name, then normalise them.
    original_input = outputs
    softmax_outputs = softmax(original_input)

    # Show the raw values and their softmax one after the other.
    for caption, values in (
        ("Original array:", original_input),
        ("Softmax result:", softmax_outputs),
    ):
        print(caption, values)

Original array: [array([[[-3.9329590e+01, -1.4918658e+01, -1.6156431e+01, ...,
         -2.7732710e+01, -3.5699852e+01, -6.9615802e-05],
        [-3.7635418e+01, -1.3395227e+01, -1.5437682e+01, ...,
         -2.7343876e+01, -3.5965874e+01, -2.1219028e-05],
        [-3.6312103e+01, -1.2568659e+01, -1.5509059e+01, ...,
         -2.5955353e+01, -3.5992733e+01, -3.4331686e-05],
        ...,
        [-2.5536222e+01, -6.0488372e+00, -8.7984028e+00, ...,
         -2.0566444e+01, -2.8296503e+01, -6.5437560e-03],
        [-2.8570650e+01, -8.4600286e+00, -1.1000808e+01, ...,
         -2.4977379e+01, -3.4118538e+01, -6.0134922e-04],
        [-2.6104115e+01, -9.4064903e+00, -1.2224298e+01, ...,
         -3.0500118e+01, -3.7189007e+01, -4.2679737e-04]]], dtype=float32)]
Softmax result: [[[[8.3056899e-18 3.3182491e-07 9.6238885e-08 ... 9.0330971e-13
    3.1314885e-16 9.9993026e-01]
   [4.5200828e-17 1.5223922e-06 1.9746938e-07 ... 1.3326174e-12
    2.4000499e-16 9.9997878e-01]
   [1.6976723e-16 3.47

In [15]:
from pyctcdecode import build_ctcdecoder 
import nemo.collections.asr as nemo_asr


# Load the pretrained Hindi Conformer-CTC (BPE) model from NeMo.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(
    model_name='stt_hi_conformer_ctc_medium'
)
# Ask for the per-frame log-probabilities rather than the decoded text:
# without logprobs=True, transcribe() returns a transcript, which
# decoder.decode() below cannot consume.
# NOTE(review): `logprobs` is the keyword used in the pyctcdecode NeMo
# example; confirm it is still accepted by the installed NeMo version.
logits = asr_model.transcribe([i], logprobs=True)[0]

# Array view of the ONNX session outputs captured earlier (not decoded here).
decoder_inputs = np.array(original_input)

# Build a CTC decoder over the model's own vocabulary.
decoder = build_ctcdecoder(asr_model.decoder.vocabulary)

output = decoder.decode(logits)

AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import)