In [1]:
import numpy as np
import torch
from evaluate import load
from datasets import load_from_disk
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
from tqdm import tqdm
import re
from tqdm import tqdm
import os
import pandas as pd

In [2]:
def normalize_text(text: str):
    for char in [".", ",", "!", "?", "(", ")"]:
        text = text.replace(char , " ")
    text = text.replace("ё", "е")
    text = re.sub(' +', ' ', text)
    text = re.sub(r'[^\w\s]','', text)
    text = text.lower().strip()
    return text

In [3]:
wer = load("wer")
cer = load("cer")

In [4]:
MODEL_ID = "jonatasgrosman/wav2vec2-xls-r-1b-russian"
# MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-russian"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
;

''

In [5]:
basefolder = "/home/docker_current/src/common_voice"
folders = os.listdir(basefolder)
folders

['vad_0.4', 'vad_0.3', 'vad_orig', 'vad_0.1', 'vad_0.0', 'vad_0.5', 'vad_0.2']

In [6]:
df = pd.DataFrame()

In [7]:
for folder in folders:
    print(folder)

    common_voice = load_from_disk(os.path.join(basefolder, folder))

    preds = []
    refs = []

    for i in tqdm(range(len(common_voice))):
        wav = wav = torch.Tensor(common_voice[i]["audio"]["array"])
        gt = common_voice[i]["sentence"]

        # get pred by model
        inputs = processor(wav, sampling_rate=16_000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)

        with torch.no_grad():
            logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_sentences = processor.batch_decode(predicted_ids)

        preds += predicted_sentences
        refs.append(gt)
        
    preds = [normalize_text(i) for i in preds]
    refs = [normalize_text(i) for i in refs]

    wer_metric = 100 * wer.compute(predictions=preds, references=refs)
    cer_metric = 100 * cer.compute(predictions=preds, references=refs)
    print(f"wer = {wer_metric}, cer = {cer_metric}")
    
    data = {"reserve" : folder.replace(".", "_"),
            "dataset" : "common_voice",
            "wer" : wer_metric,
            "cer" : cer_metric}
    
    data = pd.DataFrame([data])
    df = pd.concat([df, data])

vad_0.4


100%|██████████| 998/998 [02:35<00:00,  6.42it/s]


wer = 15.429159696306783, cer = 5.122633309398738
vad_0.3


100%|██████████| 998/998 [02:26<00:00,  6.82it/s]


wer = 15.712263543945436, cer = 5.364498696194399
vad_orig


100%|██████████| 1000/1000 [02:38<00:00,  6.33it/s]


wer = 15.019255455712452, cer = 4.742151409446592
vad_0.1


100%|██████████| 998/998 [02:08<00:00,  7.74it/s]


wer = 17.732595547548577, cer = 7.503495710668531
vad_0.0


100%|██████████| 998/998 [02:00<00:00,  8.29it/s]


wer = 20.113241539055462, cer = 9.965609765314992
vad_0.5


100%|██████████| 998/998 [02:23<00:00,  6.93it/s]


wer = 15.094582421824734, cer = 4.869430482597029
vad_0.2


100%|██████████| 998/998 [02:12<00:00,  7.52it/s]


wer = 16.40715480633123, cer = 6.192131816635803


In [8]:
df.to_csv("/home/docker_current/src/metrics/huggingsound_commonvoice.csv", index=False)