In [None]:
%%capture
!pip install datasets==2.4.0
!pip install transformers==4.30.0
!pip install torchaudio==0.11.0.+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install jiwer
!pip install pyctcdecode==0.5.0
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install huggingface_hub

In [None]:
import huggingface_hub
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; main() loads the
# Common Voice dataset with use_auth_token=True, which relies on this login.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset, load_metric, Audio, Dataset
from transformers import pipeline, AutoFeatureExtractor, AutoTokenizer, AutoConfig, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
import re
import torch
import unicodedata
from typing import Dict

In [None]:
# Punctuation and quote characters stripped from reference transcripts.
# Written as a raw string so the backslashes reach the regex engine unchanged
# instead of being (invalid) Python string escapes.
_CHARS_TO_IGNORE_PATTERN = re.compile(r'[,?.!\-;:"“%‘”�\'«»–—’„…]')


def normalize_text(text: str) -> str:
    """Normalize a transcript so it can be compared with model predictions.

    Steps, in order: lowercase, Unicode NFKC normalization, removal of the
    punctuation/quote characters in ``_CHARS_TO_IGNORE_PATTERN``, and
    collapsing of all whitespace runs into single spaces (leading and
    trailing whitespace is dropped).

    Args:
        text: Raw transcript text.

    Returns:
        The normalized transcript; an empty string if nothing remains.
    """
    text = text.lower()
    text = unicodedata.normalize('NFKC', text)
    text = _CHARS_TO_IGNORE_PATTERN.sub("", text)
    # split() with no arguments splits on any whitespace run and discards
    # empty pieces, so re-joining yields single-space separated words.
    return " ".join(text.split())

def main(
    model_id: str = "Osolon/wav2vec2-large-xls-r-300m-pl",
    dataset_name: str = "mozilla-foundation/common_voice_10_0",
    language: str = "pl",
    split: str = "test",
    use_lm: bool = True,
) -> None:
    """Evaluate a CTC speech-recognition model and print its WER and CER.

    Loads the requested dataset split, resamples its audio to the rate the
    model's feature extractor expects, transcribes every example with an
    ASR pipeline and compares the transcripts against the normalized
    reference sentences.

    Args:
        model_id: Hub id used for the processor, the config and the model.
        dataset_name: Dataset id passed to ``load_dataset``.
        language: Dataset configuration name (the language code).
        split: Dataset split to evaluate on.
        use_lm: If True, load ``Wav2Vec2ProcessorWithLM`` and hand its
            decoder to the pipeline; if False, load the plain
            ``Wav2Vec2Processor`` and pass ``decoder=None``.

    Returns:
        None. The metrics are printed to stdout.
    """
    dataset = load_dataset(dataset_name, language, use_auth_token=True, split=split)

    if use_lm:
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
        decoder = processor.decoder
    else:
        processor = Wav2Vec2Processor.from_pretrained(model_id)
        decoder = None
    feature_extractor = processor.feature_extractor
    sampling_rate = feature_extractor.sampling_rate
    tokenizer = processor.tokenizer

    # Resample the audio column to the feature extractor's sampling rate.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # Build the evaluation pipeline.
    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForCTC.from_pretrained(model_id)

    asr = pipeline("automatic-speech-recognition", config=config, model=model, tokenizer=tokenizer,
                   feature_extractor=feature_extractor, decoder=decoder)

    # Mapping function: adds the model transcript and the normalized reference.
    def map_to_pred(batch):
        # chunk_length_s / stride_length_s are explicitly passed as None,
        # i.e. no chunking parameters are requested from the pipeline.
        prediction = asr(batch["audio"]["array"], chunk_length_s=None, stride_length_s=None)

        batch["prediction"] = prediction["text"]
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # Drop the original columns so only "prediction" and "target" remain.
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # Load the metrics.
    wer = load_metric("wer")
    cer = load_metric("cer")

    # Compute the metric values.
    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])

    # Show the result.
    result_str = (
        f"WER: {wer_result}\n"
        f"CER: {cer_result}"
    )
    print(result_str)

# Entry-point guard: run the evaluation only when this code is executed
# directly (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()



Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/8223 [00:00<?, ?ex/s]

WER: 0.1028663500678426
CER: 0.025851223461143692
