In [12]:
import re
import math
import pandas as pd
from torchmetrics.text import CharErrorRate, WordErrorRate

In [28]:
def normalize_text(text: str):
    """Normalize a transcript before computing word/character error rates.

    Steps, in order: lowercase, fold "ё" into "е", turn sentence punctuation
    (``. , ! ? ( )``) into word breaks, delete every remaining character that
    is neither a word character nor whitespace, collapse whitespace runs to a
    single space, and trim both ends.

    Args:
        text: Raw transcript. ``None`` and float NaN (the value an empty cell
            arrives as in this notebook) are treated as an empty transcript.

    Returns:
        The normalized string, or ``""`` for a missing input.
    """
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""

    # Lowercase first: otherwise an uppercase "Ё" would be lowered to "ё"
    # after the fold below had already run and would survive normalization.
    text = text.lower().replace("ё", "е")

    # These marks separate words, so they become a space rather than vanish.
    text = re.sub(r"[.,!?()]", " ", text)

    # Everything else that is not a word character or whitespace is deleted
    # outright (e.g. the hyphen in "кто-то" joins the two halves).
    text = re.sub(r"[^\w\s]", "", text)

    # Collapse whitespace only after all deletions: removing a free-standing
    # symbol such as " - " leaves two adjacent spaces behind, which would
    # otherwise count as an extra character in the character error rate.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def normalize_df(df, column_names = ["transcription", "gt", "rescored_transcription"]):
    """Run ``normalize_text`` over every listed column of ``df``.

    The columns are overwritten on the frame that was passed in, and that
    same frame object is returned.

    Args:
        df: Frame holding the text columns; each name in ``column_names``
            must be present.
        column_names: Columns to normalize. The default list is only read,
            never modified.

    Returns:
        ``df`` itself, with the listed columns normalized.
    """
    for text_column in column_names:
        raw_values = df[text_column]
        df[text_column] = raw_values.apply(normalize_text)

    return df


def calculate_error_rates(target, preds):
    """Score predictions against references with word and character error rate.

    Args:
        target: Reference transcripts.
        preds: Predicted transcripts, paired element-wise with ``target``.

    Returns:
        A ``(wer, cer)`` tuple, each value unwrapped from the metric result
        with ``.item()``.
    """
    # Fresh metric objects on every call, so nothing carries over between calls.
    word_metric, char_metric = WordErrorRate(), CharErrorRate()

    word_error = word_metric(preds=preds, target=target).item()
    char_error = char_metric(preds=preds, target=target).item()
    return word_error, char_error

In [10]:
# Load each rescored split and normalize its text columns right away, so the
# metric cells below compare like with like.
train_df = normalize_df(pd.read_csv("train_df_rescored.csv"))
test_df = normalize_df(pd.read_csv("test_df_rescored.csv"))

In [13]:
# Preview one normalized row to sanity-check the columns.
train_df.head(n=1)

Unnamed: 0,transcription,gt,rescored_transcription
0,демократия неумально подвегается пафу и арабск...,демократия неумолимо продвигается по африке и ...,демократия неумолимо подстегивает пауэлла и ар...


In [32]:
# (WER, CER) of the `transcription` column against `gt`, first 3000 training rows.
calculate_error_rates(
    target=train_df["gt"][:3000],
    preds=train_df["transcription"][:3000],
)

(0.3041503429412842, 0.1029081866145134)

In [33]:
# (WER, CER) of the `rescored_transcription` column against `gt`, same 3000 rows.
calculate_error_rates(
    target=train_df["gt"][:3000],
    preds=train_df["rescored_transcription"][:3000],
)

(0.25072795152664185, 0.12694737315177917)