In [1]:
import os
import torch
from datasets import load_dataset, Audio
from transformers import Wav2Vec2ForCTC, AutoProcessor
from src.settings import settings

In [2]:
def load_test_stream(lang_code, sampling_rate=16_000):
    """Open the Common Voice 13.0 test split for one language as a stream.

    Args:
        lang_code: Dataset configuration name passed to `load_dataset`,
            e.g. "en" or "mn".
        sampling_rate: Rate handed to `Audio` when casting the "audio" column.

    Returns:
        The streaming dataset with its "audio" column cast to `sampling_rate`.
    """
    # NOTE(review): newer `datasets` releases deprecate `use_auth_token` in
    # favour of `token` — confirm against the installed version before changing.
    stream = load_dataset(
        "mozilla-foundation/common_voice_13_0",
        lang_code,
        split="test",
        streaming=True,
        use_auth_token=settings.hf_auth_token,
    )
    return stream.cast_column("audio", Audio(sampling_rate=sampling_rate))


# English: keep only the waveform array of the first test example.
stream_data = load_test_stream("en")
en_sample = next(iter(stream_data))["audio"]["array"]

# Mongolian: same procedure; `stream_data` is rebound to the "mn" stream.
stream_data = load_test_stream("mn")
mn_sample = next(iter(stream_data))["audio"]["array"]

Reading metadata...: 16372it [00:00, 20141.31it/s]
Reading metadata...: 1877it [00:00, 5226.65it/s]


In [3]:
# Checkpoint id used for both the model and its processor.
# Other checkpoint ids noted alongside it:
#   facebook/mms-1b-fl102
#   facebook/mms-1b-l1107
#   facebook/mms-1b-all
model_id = "facebook/mms-1b-all"

# The two loads are independent of each other; both read the same checkpoint id.
model = Wav2Vec2ForCTC.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

In [4]:
# Select the English vocabulary and adapter explicitly instead of relying on
# whatever language is currently active in the kernel: the Mongolian cell
# switches both `processor.tokenizer` and `model` in place, so re-running this
# cell after it would otherwise decode English audio with the wrong adapter.
processor.tokenizer.set_target_lang("eng")
model.load_adapter("eng")

# The sampling rate must match the rate the audio column was cast to on load.
inputs = processor(en_sample, sampling_rate=16_000, return_tensors="pt")

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**inputs).logits

# Highest-scoring token id at each position, for the first batch item.
ids = torch.argmax(outputs, dim=-1)[0]
transcription = processor.decode(ids)
transcription
# Recorded output:
# 'joe keton disapproved of films and buster also had reservations about the media'

'joe keton disapproved of films and buster also had reservations about the media'

In [14]:
# Switch the tokenizer vocabulary and the model adapter to Mongolian ("mon").
# Both calls change the shared `processor` / `model` objects in place.
processor.tokenizer.set_target_lang("mon")
model.load_adapter("mon")

inputs = processor(
    mn_sample,
    sampling_rate=16_000,
    return_tensors="pt",
)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**inputs).logits

# Highest-scoring token id at each position, for the first batch item.
ids = outputs.argmax(dim=-1)[0]
transcription = processor.decode(ids)
transcription
# Recorded output of this cell:
#   'тэр тал дээр анхаар л хандуулж тусгай тугаар бэлтгэж гаргъя гэж ярилзсан'
# NOTE(review): an earlier English gloss kept here ("soccer analysts describe
# Messi as the most dangerous player in the world") does not appear to match
# this sentence — re-translate before relying on it.

'тэр тал дээр анхаар л хандуулж тусгай тугаар бэлтгэж гаргъя гэж ярилзсан'