# Test Transcription

This notebook demonstrates audio transcription using a trained MMS model.

In [1]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory from which both the processor and the CTC model are loaded.
model_path = "../models/mms"

# Both objects are module-level state used by the cells below.
processor = Wav2Vec2Processor.from_pretrained(pretrained_model_name_or_path=model_path)
model = Wav2Vec2ForCTC.from_pretrained(pretrained_model_name_or_path=model_path)

In [3]:
def load_audio(file_path):
    """Read an audio file from disk with torchaudio.

    Args:
        file_path: Location of the audio file to decode.

    Returns:
        The two values produced by ``torchaudio.load``, in order: the
        decoded waveform and its sample rate.
    """
    signal, rate = torchaudio.load(file_path)
    return signal, rate

In [4]:
def predict(audio_path):
    """Transcribe one audio file with the notebook-level ``processor`` and ``model``.

    Steps: load the file, resample to 16 kHz when its sample rate differs,
    down-mix to a single channel, run the processor and the model without
    gradient tracking, then greedily decode (arg-max over the logits).

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The first (and only) decoded transcription produced by
        ``processor.batch_decode``.
    """
    waveform, sample_rate = load_audio(audio_path)

    # Resample only when the file's rate differs from the rate passed to the processor.
    target_sample_rate = 16000
    if sample_rate != target_sample_rate:
        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = transform(waveform)

    # Down-mix to mono. torchaudio.load returns a (channels, frames) tensor by
    # default, and a plain squeeze() only drops the channel axis for mono files:
    # a stereo waveform would stay 2-D, be treated by the processor as a batch
    # of two clips, and only the first channel would be transcribed below.
    # Averaging over a single channel leaves mono input unchanged.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Feature extraction: the processor turns the raw 1-D signal into model inputs.
    input_values = processor(waveform.numpy(), return_tensors="pt", sampling_rate=target_sample_rate).input_values

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: most likely token id per frame, then decode to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

In [7]:
# Sample clip to transcribe (path is relative to the notebook's directory).
audio_file = "../data/cv-corpus-6.1-indonesian/clips/common_voice_id_19051302.mp3"

# Run the full load -> resample -> model -> decode pipeline and show the text.
transcription = predict(audio_path=audio_file)
print("Transcription:", transcription)

Transcription: dia meninggal dunia kemarin sia
