In [1]:
# !pip install -U openai-whisper

In [2]:
from tqdm import tqdm
import re
from evaluate import load
from datasets import load_from_disk
import whisper
import torch
import pandas as pd

In [3]:
wer = load("wer")
cer = load("cer")

In [4]:
def normalize_text(text: str):
    for char in [".", ",", "!", "?", "(", ")"]:
        text = text.replace(char , " ")
    text = text.replace("ё", "е")
    text = re.sub(' +', ' ', text)
    text = re.sub(r'[^\w\s]','', text)
    text = text.lower().strip()
    return text

In [5]:
model = whisper.load_model("large")
options = whisper.DecodingOptions(fp16=True, language="ru")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
;

''

In [6]:
import os
basefolder = "/home/docker_current/src/common_voice"
folders = os.listdir(basefolder)
folders

['vad_0.4', 'vad_0.3', 'vad_orig', 'vad_0.1', 'vad_0.0', 'vad_0.5', 'vad_0.2']

In [7]:
df = pd.DataFrame()

In [8]:
for folder in folders:
    print(folder)
    
    common_voice = load_from_disk(os.path.join(basefolder, folder))

    preds = []
    refs = []

    for i in tqdm(range(len(common_voice))):
        wav = torch.Tensor(common_voice[i]["audio"]["array"])
        gt = common_voice[i]["sentence"]

        predicted_sentences = model.transcribe(wav, fp16=False, language="ru")["text"]
    
        preds.append(predicted_sentences)
        refs.append(gt)
            
    preds = [normalize_text(i) for i in preds]
    refs = [normalize_text(i) for i in refs]

    wer_metric = 100 * wer.compute(predictions=preds, references=refs)
    cer_metric = 100 * cer.compute(predictions=preds, references=refs)
    
    print(f"wer = {wer_metric}, cer = {cer_metric}")
    
    data = {"reserve" : folder.replace(".", "_"),
            "dataset" : "common_voice",
            "wer" : wer_metric,
            "cer" : cer_metric}
    
    data = pd.DataFrame([data])
    df = pd.concat([df, data])

vad_0.4


100%|██████████| 998/998 [14:27<00:00,  1.15it/s]


wer = 12.610989576631065, cer = 5.564793469634557
vad_0.3


100%|██████████| 998/998 [14:39<00:00,  1.13it/s]


wer = 12.945566851113114, cer = 5.921922829825026
vad_orig


100%|██████████| 1000/1000 [14:48<00:00,  1.13it/s]


wer = 12.413350449293967, cer = 5.483171490525125
vad_0.1


100%|██████████| 998/998 [14:32<00:00,  1.14it/s]


wer = 14.940162141294557, cer = 7.924870564226598
vad_0.0


100%|██████████| 998/998 [14:22<00:00,  1.16it/s]


wer = 17.179256208982114, cer = 10.245266618797476
vad_0.5


100%|██████████| 998/998 [15:47<00:00,  1.05it/s]


wer = 12.276412302149016, cer = 5.3682778428630815
vad_0.2


100%|██████████| 998/998 [14:50<00:00,  1.12it/s]

wer = 13.75627332389654, cer = 6.692868750236197





In [9]:
# preds = [normalize_text(i) for i in preds]
# refs = [normalize_text(i) for i in refs]

# wer_metric = 100 * wer.compute(predictions=preds, references=refs)
# cer_metric = 100 * cer.compute(predictions=preds, references=refs)

# print(f"wer = {wer_metric}, cer = {cer_metric}")

In [10]:
df.to_csv("/home/docker_current/src/metrics/whisper_commonvoice.csv", index=False)