In [None]:
%%capture
!pip install transformers datasets
!git clone https://github.com/sunbirdai/leb.git
!pip install -r leb/requirements.txt
!pip install jiwer

In [None]:
import yaml
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import pipeline
from datasets import load_metric
import leb.dataset
import csv

In [None]:
yaml_config = '''
common_source: &common_source
  type: speech
  language: [lug,eng,ach,nyn]
  preprocessing:
    - set_sample_rate:
        rate: 16_000

common_target: &common_target
  type: text
  language: [lug,eng,ach,nyn]
  preprocessing:
    - lower_case
    - clean_and_remove_punctuation

test:
    huggingface_load:
        - path: Sunbird/salt
          name: multispeaker-lug
          split: test
        - path: Sunbird/salt
          name: multispeaker-eng
          split: test
        - path: Sunbird/salt
          name: multispeaker-ach
          split: test
        - path: Sunbird/salt
          name: multispeaker-nyn
          split: test

    source: *common_source
    target: *common_target

'''

In [None]:
auth_token = "xxxxx"


In [None]:
with open("config.yaml", "a") as f:
  print(yaml_config, file=f)

In [None]:
class TranscriptionEvaluator:
    """
    Evaluates transcription quality over multiple languages using a SB MMS models,
    measuring Word Error Rate (WER) across a test set for each language.
    """
    def __init__(self, config_path):
        with open(config_path) as f:
            self.config = yaml.safe_load(f)
        self.test_ds = leb.dataset.create(self.config['test'])
        self.supported_languages = ['ach', 'lug', 'teo', 'nyn']
        self.wer_metric = load_metric("wer")
        self.auth_token = os.environ.get("HF_TOKEN")
        os.environ["HF_TOKEN"] = "xxxxx"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def transcribe_audio_batch(self, audio_files, languages):
        """
        Transcribes a batch of audio files, filtering unsupported languages.
        """
        transcriptions, success_flags = [], []
        for language in set(languages):
            if language not in self.supported_languages:
                print(f"Skipping unsupported language: {language}")
                continue
            transcriptions, success_flags = self.transcribe_for_language(audio_files, languages, language)
        return transcriptions, success_flags

    def transcribe_for_language(self, audio_files, languages, language):
        """
        Handles transcription for a specific language.
        """
        try:
            pipe = self.initialize_pipeline(language)
            lang_indices = [i for i, lang in enumerate(languages) if lang == language]
            lang_audio_files = [audio_files[i] for i in lang_indices]
            outputs = pipe(lang_audio_files)
            return self.process_outputs(outputs, lang_indices), [True] * len(lang_indices)
        except Exception as e:
            print(f"Error processing language {language}: {e}")
            return [], []

    def initialize_pipeline(self, language):
        """
        Initializes the pipeline for a given language.
        """
        model_id = "Sunbird/sunbird-mms"
        pipe = pipeline(model=model_id, device=self.device, token=self.auth_token)
        pipe.tokenizer.set_target_lang(language)
        pipe.model.load_adapter(language)
        return pipe

    def process_outputs(self, outputs, lang_indices):
        """
        Processes the pipeline outputs into transcriptions.
        """
        transcriptions = [None] * len(lang_indices)
        for i, output in enumerate(outputs):
            transcriptions[lang_indices[i]] = output["text"]
        return transcriptions

    def calculate_batch_wer(self, predictions, references):
        """
        Calculates the Word Error Rate (WER) for a batch of predictions and references.
        """
        return self.wer_metric.compute(predictions=predictions, references=references)

    def evaluate(self):
        """
        Evaluates the WER across the test set for each language and prints the results.
        """
        batch_size = 8
        loader = DataLoader(self.test_ds, batch_size=batch_size, collate_fn=lambda x: x)
        total_wer, total_files = 0, 0
        wer_by_language = {}

        # Open a CSV file to write the transcriptions
        with open('transcriptions_comparison.csv', mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Language', 'Predicted Transcription', 'True Transcription'])

            for batch in loader:
                audio_files = [np.array(item['source']) for item in batch]
                languages = [item['source.language'] for item in batch]
                true_transcripts = [item['target'] for item in batch]

                predicted_transcripts, success_flags = self.transcribe_audio_batch(audio_files, languages)
                filtered_true_transcripts = [t for t, s in zip(true_transcripts, success_flags) if s]

                if predicted_transcripts and filtered_true_transcripts:
                    print(predicted_transcripts)
                    print(filtered_true_transcripts)
                    batch_wer = self.calculate_batch_wer(predicted_transcripts,filtered_true_transcripts)
                    total_wer += batch_wer * len(filtered_true_transcripts)
                    total_files += len(filtered_true_transcripts)
                    self.update_language_wer(languages, success_flags, batch_wer, wer_by_language)

                    # Write each transcription pair to the CSV, along with its language
                    for language, pred, true in zip(languages, predicted_transcripts, filtered_true_transcripts):
                      writer.writerow([language, pred, true])

        self.print_results(wer_by_language, total_wer, total_files)

    def update_language_wer(self, languages, success_flags, batch_wer, wer_by_language):
        """
        Updates the WER statistics for each language based on the batch results.
        """
        filtered_languages = [lang for lang, success in zip(languages, success_flags) if success]
        for language in filtered_languages:
            if language not in wer_by_language:
                wer_by_language[language] = []
            wer_by_language[language].append(batch_wer)

    def print_results(self, wer_by_language, total_wer, total_files):
        """
        Prints the final WER results by language and overall.
        """
        for language, wers in wer_by_language.items():
            avg_wer = sum(wers) / len(wers)
            print(f"{language}: {avg_wer:.4f}")
        overall_wer = total_wer / total_files if total_files > 0 else 0
        print(f"Overall WER across the test set: {overall_wer:.4f}")

In [None]:
evaluator = TranscriptionEvaluator("config.yaml")
evaluator.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: [

['ekikola kyakasooli kya kyenvu wabula langi yakyo etera okuba eya kitaka wansi', 'buli gi amabala ameru ku bikoola byakasooli galeetebwa biwuka', 'emikolo kitundu ku bulamu', 'emikolo kitundu ku bulamu', 'ekivulu kyabakazanyyirizi kyabadde kitya', 'ekivulu kya baakazannyirizi kyabadde kitya', 'kolera bulwadde obuleetebwa obukyafu', 'kolera bulwadde obuleetebwa obukyafu']
['ekikoola kya kasooli kya kyenvu wabula langi yaakyo etera okuba eya kitaka wansi', 'ebikoola bya kasooli biriiriddwa ebisaanyi', 'emikolo kitundu ku bulamu', 'emikolo kitundu ku bulamu', 'ekivvulu kya bakazanyiikirizi kyabadde kitya', 'ekivvulu kya bakazanyiikirizi kyabadde kitya', 'kolera bulwadde obuleetebwa bukyafu', 'kolera bulwadde obuleetebwa bukyafu']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['embuto ezamangu zireetedde omuwendo gwabaana abawala abawanduka mu ssomero okweyongera', 'okusaba kwokweyimirirwa ku kakalu kujja kuwulirwa kkooti ku lwokutaano', 'abantu bangi tebaddizibwa ssente ze bakozesa mu kulabikako mu kkooti', 'buli omu yali mweraliikirivu ku ngeri yokubalirira', 'amakolero agalina abakozi abasukka mu kkumi galowooza kusalako bakozi', 'kimenya mateeka omukozi okukola nga taweebwa luwummula', 'abalimi balina okuwewulwa kwebyo bye basaasaanyizaako okusobola okwongeza ku magoba gaabwe', 'ebisolo bikosebwa nnyo endwadde']
['embuto ezamangu zireetedde omuwendo gwabaana abawala abawanduka mu ssomero okweyongera', 'okusaba kwokweyimirirwa ku kakalu kujja kuwulirwa kkooti ku lwokutaano', 'abantu bangi tebaddizibwa ssente ze bakozesa mu kulabikako mu kkooti', 'buli omu yali mweraliikirivu ku ngeri yokubalirira', 'amakolero agalina abakozi abasukka mu kkumi galowooza kusalako bakozi', 'kimenya mateeka omukozi okukola nga taweebwa luwummula', 'abalimi balina okuwewulwa 

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ebyobulimi birina okutumbulwa', 'bulijjo tuyina okuyigira mu nsobi zaffe ezaayita', 'kampuni zamafuta zeetaaga okulaba engeri ennungi ezokukwatamu kasasiro', 'buli luvannyuma lwokulonda ebivudde mu kulonda kwabeesimbyewo birangirirwa', 'bano be beesimbyewo okuva mu bitundu byeggwanga ebyenjawulo', 'abayizi abali mu bibiina ebyakamalirizo bafunye okulambikibwa mu bigezo byabwe ebyakamalirizo', 'ttiimu yayolesezza obukodyo bwomupiira obulungi mu mpaka', 'ekirwadde bunansi kyandi tulemye okutangira naye twasobola okukirwanyisa']
['ebyobulimi birina okutumbulwa', 'bulijjo tulina okuyigira mu nsobi zaffe ezaayita', 'kkampuni zamafuta zeetaaga okulaba engeri ennungi ezokukwatamu kasasiro', 'buli luvannyuma lwokulonda ebivudde mu nkulonda kwabeesimbyewo birangirirwa', 'bano be beesimbyewo okuva mu bintundu byeggwanga ebyenjawulo', 'abayizi abali mu bibiina ebyakamalirizo bafunye okulambikibwa mu bigezo byabwe ebyakamalirizo', 'ttiimu yayolesezza obukodyo bwomupiira obulungi mu mpaka', 'ekir

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['balina okukkiriza abaana okwasanguza ebirowoozo byabwe', 'okuwulira emisango gyobuliisamaanyi mu kkooti kutwala bbanga ki', 'okunyweza ebyokwerinda mu kyalo kya mugaso nnyo', 'olukalala okuwandiikibwa abalimi luyamba abakungu okumanya bantu ki abeenyigira mu bulimi', 'omuzannyo azanyira ku mabbali gekisaawe asobola okukola ensobi ezenjawulo mu muzannyo natanenyezebwa', 'ensasula embi ereetera emisango', 'mu bunnabyalo kwe kudduka emisinde emiwanvu', 'weewala okwetaba nabantu abalwadde okwewala okusaasaana kwendwadde ezimu']
['balina okukkiriza abaana okwasanguza ebirowoozo byabwe', 'okuwulira emisango gyobuliisamaanyi mu kkooti kutwala bbanga ki', 'okunyweza ebyokwerinda mu kyalo kya mugaso nnyo', 'olukalala okuwandiikibwa abalimi luyamba abakungu okumanya bantu ki abeenyigira mu bulimi', 'omuzannyi azannyira ku mabbali gekisaawe asobola okukola ensobi ezenjawulo mu muzannyo natanenyezebwa', 'ensasula embi ereetera emisango', 'mubunabyalo kwe kudduka emisinde emiwanvu', 'weewale okwe

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ekitebe kya amerika mu uganda kisiimiddwa olwobuyambi bwakyo eri ebyobulamu bwa uganda', 'waliwo ensonga ezimu ezekuusa ku nkola yamateeka gaffe', 'ebibuga ebijja biri mu kutondebwawo mu ggwanga okusobola okutumbula obuweereza', 'abantu tebajja kukkirizibwa okufuna kitundu ku ssente ze batereka okutuusa nga bawummudde ku mirimu', 'poliisi etunula mu alipoota zokuwamba abantu okwetooloola kampala', 'abantu tebamanyi kye baagala', 'bakansala ba disitulikiti basasulwa bubi ekikosa empeereza', 'abasawo beetaaga basulemu nnyumba zabakozi okusobola okukola amangu ku balwadde']
['ekitebe kya america mu uganda kisiimiddwa olwobuyambi bwakyo eri ebyobulamu bwa uganda', 'waliwo ensonga ezimu ezeekuusa ku nkola yamateeka gaffe', 'ebibuga ebiggya biri mu kutondebwawo mu ggwanga okusobola okutumbula obuweereza', 'abantu tebajja kukkirizibwa kufuna kitundu ku ssente ze batereka okutuusa nga bawummudde ku mirimu', 'poliisi etunula mu alipoota zokuwamba bantu okwetooloola kampala', 'abantu tebamanyi

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['emisolo egikungaanyizibwa kuno girina okulondoolwa okulwanyisa okukozesa obubbi bwensimbi', 'katonda wamaanyi', 'okweyongera okwabanoonyiboobubudamu kwaviiriddeko okusaasaana kwobulwadde', 'beetaaga okubuulira abantu ku miganiro gye bajja okufuna', 'okubula ebyokugula nokutunda kitataaganya entambula yoobusuubuzi', 'okulonda kutera kuggwera mu bovu yo', 'abafumbo balina okugonjoola obutakwatagana bwabwe', 'yasadde aka embuzi ye mu bajjajja']
['emisolo egikungaanyizibwa kuno girina okulondoolwa okulwanisa okukozesa obubbi bwensimbi', 'katonda wa maanyi', 'okweyongera okwabanoonyiboobubudamu kwandiviirako okusaasaana kwobulwadde', 'beetaaga okuuulira abantu ku miganyulo gye bajja okufuna', 'okubula ebyokugula nokutunda kitataaganya entambula yobusuubuzi', 'okulonda kuteera kuggwera mu buvuyo', 'abafumbo balina okugonjoola obutakwatagana bwabwe', 'yasaddako embuzi ye mu ba jjaja']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ebirowoozo ebizimba bireeta enkulaakulana mu byenfuna', 'ettaka lirina okuwandiisibwa mu mateeka mu mannya ga nnannyini ryo omutuufu', 'amagye tegalina kwetaba mu byabufuzi nokukola ebintu byonna ebyekuusa ku byobufuzi', 'okwemulugunya kwonna okukwata ku mpeereza zemmotoka ezitambuza abalwadde zirina okutwalibwa eri abakulu ba disitulikiti', 'omuwendo gwabakyala bembuto abafa nga bali mbuto nga bazaala oba nga baakamala okuzaala kweyongera', 'obwakabaka nobwami bukyaliwo', 'abakulembeze bamadiini beetaaga okwenyigira mu mirimu gyobusuubuzi okulongoosa obulamu bwabwe', 'omukyala yayanjulidde bazadde be omwami']
['ebirowoozo ebizimba bireeta enkulaakulana mu byenfuna', 'ettaka lirina okuwandiisibwa mu mateeka mu mannya ga nannyini lyo omutuufu', 'amagye tegalina kwetaba mu byabufuzi nokukola ebintu byonna ebyekuusa ku byobufuzi', 'okwemulugunya kwonna okukwata ku mpereza zemmotoka ezitambuza abalwadde zirina kutwalibwa eri abakulu ba disitulikiti', 'omuwendo gwabakyala bembuto abafa ng

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['wajja kubaawo pulogulaamu za sikaala eri abo abanaakola obulungi', 'kya mugaso okukuuma nokulabirira obutonde nobulamu bwebisolo byokutale', 'omukungu aweereddwa oluwummula', 'ebikozesebwa mu masomero bijja kugabanyizibwa mu masomero ana mu bitundu byobukiika kkono', 'abanoonyibobubudamu tebalina mmere emala kubeezaawo obulamu bwabwe', 'amasomero gateekeddwa okuwa abayizi ekyokulya', 'gavumenti ejja kwongeza ku nfuluma eyamasannyalaze okuva ku bbibiro lye karuma', 'paaka ya takisi nnungi nnyo okukoleramu emirimu']
['wajja kubaawo pulogulaamu za sikaala eri abo abanaakola obulungi', 'kya mugaso okukuuma nokulabirira obutonde nobulamu bwebisolo byokuttale', 'omukungu aweereddwa oluwummula', 'ebikozesebwa mu masomero bijja kugabanyizibwa mu masomero ana mu bitundu byobukiikakkono', 'abanoonyiboobubudamu tebalina mmere emala kubeezaawo bulamu bwabwe', 'amasomero gateekeddwa okuwa abayizi ekyokulya', 'gvaumenti ejja kwongeza ku nfulumya yamasannyalaze okuva ku bbibiro lrye karuma', 'paaka

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ebisolo byomu nsiko bireeta ssente okuva ebweru', 'waliwo okwanguyirwa mu kutuuka mu bbanka amasomero nobutale', 'yasaba abantu okwongera okuwagira abalenzi okutuusa bwe bamaliriza emisomo gyabe', 'ettaka kye kimu ku byobugagga ebisingira ddala jjajjawaffe omukyala bye yatulekera', 'akakiiko ka poliisi akakwasisa empisa kaamalawo obutakkaanya', 'omunoonyereza ku musango alina kukola ki', 'ennyanja nnalubaale mpanvu kye nkana ki', 'baakubaganya ebirowoozo ku kusoomoozebwa abayizi kwe basanga mu ssomero']
['ebisolo byomu nsiko bireeta ssente okuva ebweru', 'waliwo obwanguyirwa mu kutuuka mu bbanka amasomero nobutale', 'yasaba abantu okwongera okuwagira abalenzi okutuusa lwe bamaliriza emisomo gyabwe', 'ettaka kye kimu ku byobugagga ebisingira ddala jjajja waffe omukyala bye yatulekera', 'akakiiko ka poliisi akakwasisa empisa kaamalawo obutakkaanya', 'omunoonyereza ku misango alina kukola ki', 'ennyanja nalubaale mpanvu kyenkana ki', 'baakubaganya ebirowozo ku kusoomoozebwa abayizi kwe 

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ekidyeri kisomba emirundi esatu mu lunaku', 'omuwendo gwabantu abaagala okugula gusalawo bungi ki obulina okuleetebwa', 'ababundabunda beebazizza katonda olokubasindikira abagabi boobuyambi okubayamba', 'abasomi mu bibiina ebyakamalirizo bajja kuddayo ku ntandikwa yomwezi gwomukaaga', 'abantu baanyiiga olwabantu abamu butaagala kuzzaayo ssente', 'abantu bekitundu balina okufuna ebyetaagisa nga gavumenti bwe yabasuubiza', 'bannamateeka bomuwawaabirwa baagamba nti baagala okulaba obwenkanya nga bukolebwa', 'abantu basanyufu olwobuyambi bwemmere okuva mu gavumenti']
['ekidyeri kisomba emirundi esatu mu lunaku', 'omuwendo gwabantu abaagala okugula gusalawo bungi ki obulina okuleetebwa', 'ababundabunda bebaziza katonda olwokubasindikira abagabi bobuyambi okubayamba', 'abasomi mu bibiina ebya kamalirizo bajja kuddayo ku ntandiikwa yomwezi gwomukaaga', 'abantu baanyiiga olwabantu abamu obutaagala kuzzaayo ssente', 'abantu bekitundu balina okufuna ebyetaagisa nga gavumenti bwe yabasuubiza', 

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['abantu basiimye ekitongole olwebyo bye kigezaako okubakolera', 'akakiiko akakubirizi kalina obukuubagano nekitongole kyamawulire ekya westnitle press association', 'abakulembeze beddiini balina okuba abeetoowaze ate nga ba mpisa', 'kirungi okulaga okusiima', 'buli lwokola ekintu ekitali kituufu bambi weetonde', 'tteekateeka ki ezimu kuziteekeddwawo okulwanyisa akawuka ka kolona', 'abatembeye abeewandiisa beemulugunya ku bukyafu obuli mu katale', 'tulinayo mulwadde yenna leero']
['abantu basiimye ekitongole olwebyo bye kigezaako okubakolera', 'akakiiko akakubirizi kayina obukuubagano nekitongole kyamawulire ekya west nile press association', 'abakulembeze beddiini balina okuba abeetoowaze ate nga ba mpisa', 'kirungi okulaga okusiima', 'buli lwokola ekintu ekitali kituufu bambi weetonde', 'nteekateeka ki ezimu ku ziteekeddwawo okulwanyisa akawuka ka kolona', 'abatembeeyi abeewandiisa beemulugunya ku bukyafu obuli mu katale', 'tuyinayo omulwadde yenna leero']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['akawuka ka kolona kasaasaana mangu', 'obuli bwenguzi kye ki', 'kakensa aya kolokose empapula ze ezobuyigirize', 'tekinologiya aggulawo enzigi zemikisa empya okukola bizinensi', 'tulindiridde nobugumiikiriza okuteekayo ebitaala byo ku nguudo', 'abalimi abasinga balima ebirime nga bya kutunda', 'omukulembeze asiimye ttiimu yeggwanga eyomupiira gwebigere olwokukola obulungi', 'abawagizi bomupiira baagala ttiimu ewangula']
['akawuka ka kolona kasaasaana mangu', 'obuli bwenguzi kye ki', 'kakensa yakolokose empapula ze ezobuyigirize', 'tekinologiya aggulawo enzigi zemikisa empya okukola bizinensi', 'tulindiridde nobugumiikiriza okuteekayo ebitala byoku nguudo', 'abalimi abasinga balima ebirime nga bya kutunda', 'abakulembeze asiimye ttiimu yeggwanga eyomupiira gwebigere olwokukola obulungi', 'abawagizi bomupiira baagala ttiimu ewangula']
Skipping unsupported language: eng


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['enzige bwe zirumba ebirime zoonona ebyobulimi byonna']
['enzige bwe zirumba ebirime zoonoona ebyobulimi byonna']
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng
Skipping unsupported language: eng


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

adapter.ach.safetensors:   0%|          | 0.00/8.81M [00:00<?, ?B/s]

Error processing language ach: list assignment index out of range
Skipping unsupported language: eng


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ningolo kop pol dano pe twero nongo cente maromo ki ma giculu me laro nok ikot', 'dano wenyo onongo gitye ki pa pi kit yoo me kibirubu kwede', 'faki mapol ma tye ki lutic ma kato apar tye ka tamo kit me juku gi idokt', 'cik pe cimo ni latic omyero otii labongo nongo yweyo', 'lupu omyero gikonygi gidwok wel piny igi wek magoba ma ginongo omede', 'pol kare two diyo lee matek', 'lok ma mako lok kom pur omyero kimed rwome', 'omyero wanong pwony ki ijami maaracu ma watimo ikare mukato angec']
['inge ngolokop pol dano pe twero nongo cente marom ki maguculu me laro lok ikot', 'dano weng onongo gitye ki par pi kit yo ma kibirubu kwede', 'factori mapol matye ki lutic makato apar tye katamo kit me juko gi idogtic', 'cik pe cimo ni latic omyero otii labongo nongo yweyo', 'lupur omyero gikonygi gidwok wel piny igi wek magoba maginongo omede', 'polkare two diyo lee matek', 'lok ma mako lok kom pur omyero gimed rwome', 'omyero wanong pwony ki ijami maracu ma watimo ikare mukato angec']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['campuni me moo omyero obed ki diro maber me onyo nyugi', 'inge bolo kwir duc kimiyo adwokki kwir pa lucung weny', 'man aye lucungu me kabedo mapat pat me uganda', 'lutino kwan ma gubi penye otyeko nongo mic ma mako penygi me agiki', 'lutuku dilo gunyotu diro tuku mabeco i pyem tuku mukato ni', 'two ma opoto ni onongo kono okelo peko madwong tutwal ento wanguku jwiko peko man', 'omyero giwek lutin gubed agonya me miyo tamgi', 'winyo pido pi lok me butu ki mon tek tek tero kare marom mene']
['kampuni me moo omyero obed ki diro maber me onyo yugi', 'inge bolo kwir ducu gimiyo adwogi kwi pa lucungu weng', 'man aye lucungu me kabedo mapatpat me uganda', 'lutinokwan ma gubipenye otyeko nongo kic mamako penygi me agiki', 'lutuk odilo gunyutu diro tuku mabeco ipyem tuku mukatoni', 'two ma oporo ni onongo kono okelo peko mawdong tutwal ento wanguku jwiko peko man', 'omyero giwek lutino gubed agonya me miyo tamgi', 'winyo pido pi lok me butu ki mon tektek tero kare marom mene']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['gwoko kwo pa dano ma icaro pi gengo awano obedo gin mapire tek', 'coyo nying lupur ibuk weko lutela ma ikin gang bedo ki ngec i kom dano mene matye kapur', 'latuku ma pe tuku idye bar ki kome twero balo tuku tyen mapol ento pe nongo lok mo keken', 'cul marac weko dano timo bal', 'ngwec pa lwak obedo ngwec mabor', 'gwake pe bed kacel ki dano ma two pi gengo nyayo two mukene', 'embars me lobo amerika gucwalo pwo botgi pi kony madit ma gumiyo me gwoko yot kom pa dano me uganda', 'tye lok mukene ikit me ngolo kop i lobo man ma pe tiati']
['gwoko kwo pa dano ma icaro pi gengo awano obedo gin ma pire tek', 'coyo nying lupur ibuk weko lutela me kin gang bedo ki ngec i kom dano mene ma tye ka pur', 'latuko ma pe tuko idyer baa ki kome twero balo tuku tyen mapol ento pe nongo lok moo keken', 'cul ma rac weko dano timo bal', 'ngwec pa lwak obedo ngwec ma bor', 'gwoke pe ibed kacel ki dano matwo pi gengo nyayo two mukene', 'embassy me lobo america gicwalo pwoc botgi pi kony madit ma gumiyo me g

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['kitye ka yubu city manyen mapol me kelo tic cok bot lwak', 'pe biyee ni dano gugam lucucente ma gugwoko niyoo wang madong guweko tic pi mwaka gi madit', 'abili tye kaneno matut lok ma lube ki tim bwomi me mako dano pi gamo cente matye ka time madwong i kampala', 'dano pe ngeyo ngo ma gimito', 'lutela ma itedero kiculu gi magoro ma eni tye ki adwogi marac i kom ticgi', 'daktari omyero gubutu i odi pa lutic wek gubed ogi twero ka konyo lutwo cut cut', 'cine me mucoro omyere niang obed tye i kom kama tye iye dok ki nen kit ma kitiyo kwede pi lweny i kom camcana', 'lubanga dit']
['gitye ka yubu citi manyen mapol me kelo tic cok bot lwak', 'pe gibi ye ni dano ogam nucu cente ma gugwoko nio wang mma dong gu weko tic pi mwaka gi madit', 'abili tye ka neno matut lok ma lube ki tim bwami me mako dano pi gamo cente ma tye ka time madwong ikampala', 'dano pe ngeyo ngo ma gimito', 'lutela ma itedero ki culu gi ma goro ma eni tye ki adwoki marac i kom tic gi', 'daktari omyero gubut i odi pa lutic

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['mete pa luring yela ikabedo ni omara omedo nyaa pa twon', 'omyero guniang lwak i kom jami ma beco ma gubinongo', 'rwenyo jami bala tic me biacara', 'ye pole tum itim gero', 'joot omyero gunong yoo me cubu peka matye ikingi', 'en otyero digine bot kwarene ma yam con', 'tam me dongo lobo keludongo pa biacara', 'ngom omyero ki co maber inying rwedi ki kome']
['medde pa luring ayella i kabedo ni omaro medo nya pa two ni', 'omyero guniang lwak i kom jami ma beco ma gubi nongo', 'rwenyo jami balo tic me biacara', 'yer pole tum i tim gero', 'jo ot omyero gu nongo yoo me cobo peko matye ikingi', 'en otyero dyegi ne bot kwari ne ma yam con', 'tam me dongo lobo kelo dongo pa biacara', 'ngom omyero ki coo maber inying rwede ki kome']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ngat ma tye iloc tye kalworo pi abili ki mony matye ikabedo', 'ludwo mutaka omyero gugwoke ka gidwoyo lutwo', 'wel mon ma tye ka too ikare me nywal tye kamede', 'tela me ker ki me te kwaro pwod tye ka tic kacelo', 'lupwonye dini omyero gutim biacara makelo itgi cene', 'lamego ni onyutu mapal cwinye bot jonyedo ne', 'yub me kwan nono bibedo tye pi jo ma bi timo maber', 'pyere tek me gwoko tim ki lee matye iye']
['mwony omyero pe gubed i lok me cung iwibye', 'koko weng ma bino i kom ambulance omyero kicwal bot lutic me district', 'wel mon matye ka too ikare me nywal tye ka mede', 'tela me ker ki me tekwaro pwod tye katic kacelo', 'lupwonye dini omyero gutim biacara mo kelo itgi cene', 'lamego ni onyutu lapal cwinye bot lunyodo ne', 'yub me kwan nono bi bedo tye pi jo ma bi timo maber', 'pire tek me gwoko tim ki lee matye iye']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['iketo abili ma ocelo dano ni iyweyo me tic', 'jami me kwan mari gibi poko gi bot gangi kwan angwen i kumalo me uganda', 'uring ayela pe gitye ki cam ma oromo', 'gangi kwan omyero gumiyi ki lutino gin acama', 'gmengamente obi medo mac ki i karuma', 'taxi park tye maber tutwal pi tic me biacaro', 'lee  tim kelo cene ki ilobo mukene', 'gitye ki gum me nongo kony me bang odi yadi ki cuk']
['ki keto abili ma ocelo dano ni i yweyo me tic', 'jami me kwan magi kibi poko gi bot gangi kwan angwen i kumalo me uganda', 'luring ayela pegitye ki cam ma oromo', 'gangi kwan omyero gumi ki ki lutino gin acama', 'gamente obi medo mac ki i karuma', 'taxi park tye maber tutwal pi tic me biacara', 'lee tim kelo cene ki ilobo mukene', 'gitye ki gum me nongo kony me me bank odi yadi ki cuk']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['en okwayo lwak me mede ki cwako awobe wang ma gutyeko kwan gi', 'nggom eni obedo gin ma dit loyo ma kwaro wa owekowa kede', 'dldul pa abili ma neno lok kom woro aye gutiyo i kom lok eni', 'abili ma otimo kwed i kom bal ma otime ticce ngo', 'nam me pii victoria tut ma rom kwene', 'guloko i kom peko ma lutino kwan nongo igang kwan', 'fery tiyo tyen adek inino acel', 'wel luwil en aye moko dit pa jami ma omyero obed cuk']
['en okwayo lwak me mede ki cwako awobe wang ma gutyeko kwan gi', 'ngom eni obedo gin ma dit loyo ma kwaro wa oweko wa kede', 'dul pa abili ma neno lok kom woro aye otiyo i kom lok eni', 'abili ma timo kwed i kom bal ma otime ticce ngo', 'nam me pii victoria tut ma rom kwene', 'guloko i kom peko ma lutino kwan nongo i gang kwan', 'ferry tiyo tyen adek inino acel', 'wel luwil en aye moko dit pa jami ma omyero obed i cuk']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['lureng ayela gupwoyo gamente pi cwalo jo ma mini gi kony', 'jo ma tye iclosapiro angwen kaabicel bubi cako kwan i acoki me dwe ma abicel', 'dwak igu wang pyeni jo mukeno pe gimito dwoko cenni', 'lwako omyero gunong jami ma mite kit ma gamento ociko kwede', 'llupirida pa ngat ma ado tye ikome ni guwaco ni gimito neno ka ada bedo tye ilok eni', 'lwak pwoyo gamente pimiyo cam it gi', 'dano ogoyo laa me gum maber ibot dul eni pi tic ma megi', 'jo ma kiyero gi me tic i kom corona tye katelo tol ki dul pa lupok lok angeya me westnile']
['luring ayella gupwoyo gamente pi cwalo joo ma minigi kony', 'joo matye i class abiro angwen ki abicel gubi cako kwan i acaki me dwe me abicel', 'lwak igi owang pien ni joo mukene onongo pe mito dwoko cene', 'lwak omyero gunong jami ma mite kit ma gamente ocike kwede', 'lupirida pa ngat ma adot tye i kome ni guwaco ni gimito neno ka adaa bedo tye ilok eni', 'lwak pwoyo gamente pi miyo cam itgi', 'dano ogoyo laa me gum maber bot dul eni pi tic ma meg gi', 'j

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ka itiyo ki opok ngec itwero cwalo lok bot dano mapol', 'teldini omyero gulub cik dok gubed dano mawor', 'obedo gin maber me miyo pwoc', 'ka itimo gin ma ma pe ber tim ber kwa kica', 'latwo acel acel bibedo ki dano acel keken ma tye ka gwoke iot yat', 'yub me ango ma kiketo ma juku nya pa two corona', 'lutim biacara ma gitye ki twero me tic gitye ka kok pi cilo ma tye icuk', 'watye ki ngat mo ma ginongo ki two coronatin']
['ka itiyo ki upok ngec itwero cwalo lok bot dano mapol', 'lutela dini omyero gulub cik dok gubed dano ma wor', 'obedo gin maber me miyo pwoc', 'ka itimo gin mo ma pe ber tim ber kwa kica', 'latwo acel acel bi bedo ki dano acel keken ma tye kagwoke ioot yat', 'yub ma ango ma ki keto ma juku nya pa two corona', 'lutim biacara ma gitye ki twero me tic gitye ka kok pi cilo matye i cuk', 'watye ki ngat mo ma kinongo ki two corona tin']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['two corona tye kanyaa oyut tutwal', 'cam can ye ngo', 'profesor oloko marac i kom kare tac me kwan ne', 'dito me tet yabo yo pi gum manyen me timo biacara', 'watye ka kuru mac me teng gudu ma akibi keto ni', 'pol pa lupwur gi puru jami me acata', 'latela omiyo mic ki lutuku dilo me lobo pi tic maber ma gutimo', 'ocwako odilo gimito tim ma luloc']
['twoo corona tye ka nyaa oyot tutwal', 'camcana aye ngo', 'professor oloko marac i kom kara tac me kwan ne', 'diro me tet yabo yoo pi gum manyen me timo biacara', 'watye ka kuru mac me teng gudu ma kibi keto ni', 'pol pa lupur gi puru jami me acata', 'latela omiyo mic ki lutuku odilo me lobo pi tic ma ber ma gutimo', 'ocwak odilo gimito team ma luloc']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

adapter.nyn.safetensors:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

Error processing language nyn: list assignment index out of range


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ca otwongo camo gi me pura kon tye kiaduki marac i kom lobowa i lok ma diacara']
['ka otwongo ocamo jami apura eni tye ki adwoki marac i kom lobo wa ilok me biacara']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['baingi tibarikugaruka kutunga esente zibakoreise omukuhurirwa kwomushango', 'buri omwe akaba airakiriire aha muringo gwokubaramu ebitabo', 'abakizi basemerra kuaisibaakir aaabaaaira ahamira', 'ebiragiro tibikwikiriza omukozi kukora atarikutunga ekihumuro', 'amasaruura gakyongera nokukyendeza ahabeiyebihingwa', 'akaaabia ahaburra baaiga', 'izereka butezegyebere ebyobuhingi noburiisa bishemereire kwongyerwamu amaani', 'tushemereire kwegyera aha nshobi ezi twakozire kare']
['baingi tibarikugaruka kutunga esente ezi bakoreise omu kuhurirwa kwomushango', 'buri omwe akaba ayerarikiriire aha muringo gwokubaramu ebitabo', 'amakorero againe abakozi abarikuhingura ahari ikumi nibateekateeka okushara aha bakozi baabo', 'ebiragiro tibirikwikiriza omukozi kukora atarikutunga ekihuumuro', 'abahingi bashemereire kugura ebyokukozesa aha beeyi yahansi kugira ngu amagoba geyongyere', 'endwara nizinyangaraza munonga amatungo', 'ebyobuhingi noburiisa bishemereire kwongyerwamu amaani', 'tushemereire kweg

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['kampuni zamazisa zishemereire kuronda oburyo bwokunaga ebitokoozi', 'ahanyima yakaruuru koona akarikuteerwa ebyarugamu biine okurangirirwa', 'abanu bo besimbireho omu byobutegyeki kuruga omu bicweka byeihanga ebitara bimwe', 'abarikweteekateekyera ebigyezo byahamuheru bahiirwe ebiragiro byokukuratira', 'tiimu ekoreka emyoga mirungi yomu muzaano gwomupiira omu mpaka ezaabaireho', 'oburwaire bukaba nibuza kujanjaara munonga kwonka tukabuzibira', 'bashemereire kwikiriza abaana okuhaya ebitiiso byabo', 'omushango gwokuhambwa nigutwara bwire ki gutakahuriirwe']
['kampuni zamajuta zishemereire kuronda oburyo bwokunaga ebitokoozi', 'ahanyima yakaruuru koona akarikuteerwa ebyarugamu biine kurangirirwa', 'aba nibo besimbireho omu byobutegyeki kuruga omu bicweka byeihanga ebitari bimwe', 'abarikweteekateekyera ebigyezo byahamuheru baheirwe ebiragiro byokukuratira', 'tiimu ekooreka emyoga mirungi yomumuzaano gwomupiira omu mpaka ezabaireho', 'oburwaire bukaba nibuza kujanjaara munonga kwonka tu

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['okureeba ngahariho ebyokwerinda omu kyaro nekyomugasho', 'ekitabo kyabahingi nikihweera abakuru okumanya abantu abejumbire omu byobuhingi', 'omuteezi womupiira kuruga heru naabaasa kukora enshobi nyingi omu muzaana rugira mwah', 'okushashurwa kubi nikureetera abantu kukora ebihagaro', 'tarasoni nikimanyisa okwiruka orugyendo orurengwa', 'yetantare kuhikana nabantu abaine endwara okubaasa kuzibira okujanjara kwendwara ezimwe', 'ekitebe kikuru kya america omu uganda nikisiimwa ahabwobuhwezi obu kirikuhayo ahabwamagara gabanya uganda', 'hariho ebintu ebitashoborokire omu nkora yaitu yebiragiro']
['okureeba ngu hariho ebyokwerinda omukyaro nekyomugasho', 'ekitabo kyabahingi nikihwera abakuru okumanya abantu abejumbire omubyobuhingi', 'omuteezi womupiira kuruga aheeru naabaasa kukora enshobi nyingi omu muzaano arugiramu aho', 'okushashurwa kubi nikureetera abantu kukora ebihagaro', 'marasoni nikimanyisa okwiruka orugyendo ruraingwa', 'yetantare kuhikaana nabantu abaine endwara okubaasa ku

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['okwetantara kabi toshemereire kutuura haihi nemishozi', 'abantu tibarikwija kuheebwa ekicweka kyesente zaabo ezibaikirwe kuhisya obu barahuumuzibwe', 'aba poriisi bariyo nibacondooza ripooti yokubuzaho abantu omu bicweka bya kampara', 'abantu tibarikumanya ekibarikwenda', 'bakansara aha disiturikiti nibashashurwe kubi ekirikutambya ekono enkora yokuhisya obuheereza aha bantu', 'abashaho nibetenga kuraara omunju ezihairwe abashaho kugira ngu babaase kuhika ahabarwaire juba nkoku kirikwetengwa', 'esente zeihanga zishemereire kukuratirwa kandi bakazireebuuza kugiranguziremwekukozesibwa kubi', 'ruhanga nowamaani']
['endembo ensya ziriyo nizitandikwaho omwihanga okutunguura omutindo gwempereza', 'abantu tibarikwija kuheebwa ekicweka kyesente zaabo ezibiikirwe kuhisya obu barahuumuzibwe', 'aba poriisi bariyo nibacondooza ripooti yokubuzaho abantu omu bicweka bya kampala', 'abantu tibarikumanya ekibarikwenda', 'ba kansara aha disiturikiti nibashashurwa kubi ekirikutamba eikono enkora yokuhi

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['okweyongyera kwempungi ezirikwija omwihanga nikubaasa kujanjaaza endwara', 'biine kuhwera abantu okumanya emigasho eibakwija kutunga', 'kufeegwa ebintu nikurabanisamu emirimo ya buzinesi', 'akaruuru bwijo nikahwera omu bwhimukiro', 'abantu abari omu bushwere bashemereire kutereeza enshonga zaabo', 'embuzi ye akagihongyera baisha enkuru', 'ebitiiso byentunguuka nibyongyera entaasya', 'eitaka rishemereire kuhandiikisibwa omubiragiro kandi omu maziina ga mukama waryo']
['okweyongyera kwempungi ezirikwija omu ihanga nikubaasa kujanjaza endwara', 'baine kuhwera abantu okumanya emigasho eibarikwiija kutunga', 'okuferwa ebintu nikurabanisamu emirimo ya bizinesi', 'akaruuru buriijo nikahwera omubwimukiriro', 'abantu abari omubushwere bashemereire kutereeza eshonga zaabo', 'embuzi ye akagihongyera baishenkuru', 'ebiteiso byentunguuka nibyongyera entaasya', 'eitaka rishemereire kuhandikisibwa omu biragiro kandi omu maziina ga mukama waryo']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['amaheti gashemerere kwejumba mu byobutegyeki', 'okwetomboitwa kwona okurikukwata aha motoka za amburensi kwine kutwarwa aha bakungu ba disiturikiti', 'omuhendo gwabakazi abarikufa nibazaara gweyongyeire', 'obukama bubiri bwona bukiriho', 'aaauabuaauiiaaakuaaa', 'omukazi akanjura omushaija we omubaziraire be', 'nihaza kubaaho enteekateeka eyokwegyera busha ahari abo abarikwikora kurungi', 'nikyomugasho okurinda egyobuhanga nenyamaizi']
['amahe tigashemereire kwejumba omubyobutegyeki', 'okwetomboitwa kwona okurikukwata aha motoka za ambyurensi kwiine kutwarwa ahabakungu ba disiturikiti', 'omuhendo gwabakazi abarikufa nibazaara gweyongyeire', 'obukama bubiri bwona bukiriho', 'abaahure baine kwejumba omu mirimo yebyobushuubuzi babaase kutunguura entuura yaabo', 'omukazi akanjura omushaija we omu bazaire be', 'nihaza kubaho enteekateeka zokwegyera busha ahari abo abarikukora kurungi', 'nekyomugasho okurinda ebyobuhangwa nenyamaishwa']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['ofiisi ahairwe ekihumo aha murogo gwebembezi', 'ebintu byokukozesa omu mashomero nibiija kugaba omu mashomeraana agarikushangwa omucweka kyamateefa ga uganda', 'abantu empungi tibaine byokurya ebirikumara kubaraisyaho', 'amashomero gashemereire kuheebeza habeegi ebyokurya', 'gavumenti neija kwongyera ahabwingi bwamashanyarazi kuruga ahadaamu ya karuma', 'taxi paaka eboneirwe kukoreramu emirimo ya bizinesi', 'enyamishwa nizireita entaasya omu ihanga', 'hariyo za banka amarwariro nobutare']
['ofiisa aheirwe ekihuumuro ahamurimo gwobwebembezi', 'ebintu byokukozesa omumashomer nibiija kugabwa omumashomero ana agarikushangwa omukicweeka kyamatemba ga uganda', 'abantu empungi tibaine byokurya ebirikumara kubabaisaho', 'amashomero gashemereire kuheereza abeegi ekyokurya', 'gavumenti neija kwongyera ahabwingi bwamashanyarazi kuruga aha daamu ya karuma', 'taxi paaka eboneire kukoreramu emirimo ya bizinesi', 'enyamaishwa nizireeta entasya omwihanga', 'hariyo za banka amarwariro nobutare']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['akashaba abantu obugurumerikuhwira obuju kusisya obubaraiji okumarayo encom yaabo', 'etaka nikyo kyobugaiga ekirikukirayo obukuru ekishwenkuru yatusigiirwe', 'akakiko ka poriisi akarikukwatisa emicwe kakakora aharubanja rwe', 'omufiisi okubuuririza aha mishango nakora ki', 'oburima bwenyanja narubahari nowinga naki', 'bakahanuura aha buremeezi obwabeegi barikushanga', 'ekitashaaya nikigyenda emirundi shatu buri zooba', 'omuhendo gwabantu abarikugura nigwo gurikwereka ebishemereire kukorwa']
['akashaba abantu okuguma barikuhweera aboojo kuhisya obubareije kumarayo emishomo yaabo', 'eitaka nikyo kyobugaiga ekirikukirayo obukuru eki shwenkuru yatusigiire', 'akakiiko ka poriisi akarikukwatisa emicwe kakakora aharubanja rwe', 'ofiisa orikubuuririza ahamishango naakora ki', 'oburengwa bwenyanja nalubaale nibwingana ki', 'bakahanuura aha buremeezi obu abeegi barikushanga', 'ekitashaaya nikigyenda emirundi eshatu burizooba', 'omuhendo gwabantu abarikugura nigwo gurikworeka ebishemereire kuko

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['empungi zikasiima ruhanga ahabwokwohereza ookubahwera', 'abeegi abari omu bibiina byakamaririzo nibaija kuza aha mashomero aha ntandikwa yokwa mukaaga', 'abantu bakagira ekiniga ngu abamwe bakaba batarikwenda kugarura ahisente', 'abantu bomukyaro bashemereire kutunga ebirikwetengwa nkoku gavumenti yaraganiise', 'ba puriida bakyababirwa bakagira ngu nibenda kureeba oburinganiza omu mushango', 'abantu nibasiima obuhweezi bwebyokurya kuruga omu gavumenti', 'abantu bashabaire kigombe emigisha ahabwomurimo muhango ogu babakoraire', 'akakiiko akatiruro kiinotaikirizara nekibinda kyabanyamako omu esituraen']
['empungi zikasiima ruhanga ahabwokwohereza abokubahwera', 'abeegi abari omu bibiina ebyakamaririzo nibaija kuza aha mashomero aha ntandikwa yokwa mukaaga', 'abantu bakagira ekiniga ngu abamwe bakaba batarikwenda kugarura esente', 'abantu bomukyaro bashemereire kutunga ebirikwetengwa nkoku gavumenti yaraganiise', 'ba puriida ba kihababirwa bakagira ngu nibenda kureeba oburinganiza omu m

Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['orikwejunisa amakuru nobaasa kuhika ahabantu baingi', 'abebembezi bediini bashemereire kworoba nokugira engyesho nungi', 'nekirungi nkooreka amashemererwa', 'kubaho okukora ekintu ekitashemereire oine kushaba okusaasirwa', 'buri murwaire weena naikirizibwa kugira omwishanja ryomwe', 'nibintu ki ebiteirweho kubaasa kuziibira endwara ya korona kujanjaara', 'abarikuguza ebintu omu katare abayehandiikiise nibetombaitwa ahabwoburofa oburi omu katare', 'twatungayo omurwaire wena eriizooba']
['orikwejunisa amakuru noobaasa kuhika aha bantu baingi', 'abebembezi bediini bashemereire kworoba nokugira engyesho nungi', 'nikirungi okworeka amashemererwa', 'ku orikukora ekintu ekitashemeire oine kushaba okusaasirwa', 'buri murwaire weena naikirizibwa kugira omujanjaabi omwe', 'ni bintu ki ebitairweho kubaasa kuzibira endwara ya korona kujanjaara', 'abarikuguza ebintu omu katare abayehandiikise nibetomboitwa ahabwoburofa oburi omu katare', 'twatungayo omurwaire weena erizooba']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['endwara ya korona ryo nejanjaara aho naaho', 'obushumankuzi nenki', 'purofesa akashwijuma orupapura rwe rwobwegyese', 'tekinorogye nereetaho emiringo emisya endiiju eobyobushuubuzi', 'torindiriire nokugomisiriza okuhisya owaratuteireho amatara gahanguota', 'abahingi baingi nibahinga ebyokuguza', 'omwebembezi aheereize tiimu yehanga ekichonco ahabwookukora gye munonga', 'abahagizi bomupiira nibakunda tiimu erikusinga']
['endwara ya korona eriyo neejanjaara ahonaaho', 'obushumankuzi nenki', 'purofeesa akashwijuma orupapura rwe rwobwegyese', 'tekinorogye neereetaho emiringo misya endiijo eyebyobushubuzi', 'turindiriire nokugumisiriza okuhisya obu barituteeraho amataara gahanguuto', 'abahingi baingi nibahinga ebyokuguza', 'omwebembezi ahareize tiimu yeihanga ekiconco ahabwokukora gye munonga', 'abahagizi bomupiira nibakunda tiimu erikusinga']


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

['enzigye ku zirikutaahirira ebihingwa zishiisha byona ebirikuruga omu buhingi']
['enzigye kuzirikutaahirira ebihingwa nizishiisha byona ebirikuruga omubuhingi']
lug: 0.1510
ach: 0.3025
nyn: 0.3872
Overall WER across the test set: 0.2765


In [None]:
# evaluator = TranscriptionEvaluator("config.yaml")
# evaluator.evaluate()

  self.wer_metric = load_metric("wer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture

Skipping unsupported language: eng


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

Skipping unsupported language: eng


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

Error processing language ach: list assignment index out of range


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

Error processing language nyn: list assignment index out of range


Some weights of the model checkpoint at Sunbird/sunbird-mms were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Sunbird/sunbird-mms and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

lug: 0.1375
ach: 0.2974
nyn: 0.3918
Overall WER across the test set: 0.2530


In [None]:
evaluator.print_results()

TypeError: TranscriptionEvaluator.print_results() missing 3 required positional arguments: 'wer_by_language', 'total_wer', and 'total_files'