In [1]:
import torch
import torchaudio
import librosa
import matplotlib.pyplot as plt
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from torchaudio.models import Conformer, RNNT
from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H
import torchaudio.transforms as T

# Load audio
def load_audio(filepath, target_sr=16000):
    waveform, sr = torchaudio.load(filepath)
    if sr != target_sr:
        resampler = T.Resample(sr, target_sr)
        waveform = resampler(waveform)
    return waveform.squeeze(), target_sr

# 1. Wav2Vec2 (Transformer + CTC)
def run_wav2vec2(audio_path):
    print("\nRunning Wav2Vec2 (Transformer + CTC)...")
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    model.eval()

    waveform, sr = load_audio(audio_path)
    inputs = processor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    print("Transcription:", transcription)
    return transcription

# 2. Conformer (Torchaudio Pretrained)
def run_conformer(audio_path):
    print("\nRunning Conformer...")
    bundle = torchaudio.pipelines.CONFORMER_RNNT_BASE
    model = bundle.get_model()
    model.eval()

    waveform, sr = load_audio(audio_path)
    emissions, _ = model(waveform.unsqueeze(0))
    decoder = bundle.get_decoder()
    transcription = decoder(emissions[0])
    print("Transcription:", transcription)
    return transcription

# 3. RNN-Transducer (RNNT)
def run_rnnt(audio_path):
    print("\nRunning RNN-Transducer...")
    bundle = torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH
    model = bundle.get_model()
    model.eval()

    waveform, sr = load_audio(audio_path)
    emissions, _ = model(waveform.unsqueeze(0))
    decoder = bundle.get_decoder()
    transcription = decoder(emissions[0])
    print("Transcription:", transcription)
    return transcription

# 4. CTC using torchaudio (Manual)
def run_torchaudio_ctc(audio_path):
    print("\nRunning Torchaudio CTC...")
    bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
    model = bundle.get_model()
    model.eval()

    waveform, sr = load_audio(audio_path)
    emissions, _ = model(waveform.unsqueeze(0))
    decoder = bundle.get_decoder()
    transcription = decoder(emissions[0])
    print("Transcription:", transcription)
    return transcription

# Example usage:
audio_file = "//content//LJ001-0008.wav"

wav2vec2_text = run_wav2vec2(audio_file)
conformer_text = run_conformer(audio_file)
rnnt_text = run_rnnt(audio_file)
torch_ctc_text = run_torchaudio_ctc(audio_file)

# Print comparison
print("\n===== Comparison of Outputs =====")
print("Wav2Vec2:", wav2vec2_text)
print("Conformer:", conformer_text)
print("RNNT:", rnnt_text)
print("Torchaudio CTC:", torch_ctc_text)


Running Wav2Vec2 (Transformer + CTC)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Transcription: HAS NEVER BEEN SURPASSED

Running Conformer...


AttributeError: module 'torchaudio.pipelines' has no attribute 'CONFORMER_RNNT_BASE'