In [1]:
import jiwer
import pandas as pd
import string
from collections import Counter
from pathlib import Path
from sqlalchemy import create_engine, Connection, text

# Config

In [2]:
LANGUAGES = ('uk', 'it')
RESULT_DB = 'cv.sqlite3'
engine = create_engine(f'sqlite:///{RESULT_DB}')

# Get references

In [3]:
dfs = []
usecols = ['path', 'sentence']
for language in LANGUAGES:
    language_base_path = Path(language).resolve()
    df = pd.read_csv(language_base_path / 'validated.tsv', sep='\t', usecols=usecols)
    dfs.append(df)

df_commons = pd.concat(dfs, ignore_index=True)
assert df_commons['path'].is_unique
df_commons.set_index('path', inplace=True)

display(df_commons.head())
display(df_commons.tail())

Unnamed: 0_level_0,sentence
path,Unnamed: 1_level_1
common_voice_uk_36819047.mp3,Він хотів стягти її з вагончика.
common_voice_uk_25651279.mp3,Відтоді я встиг об'їхати увесь світ.
common_voice_uk_39572728.mp3,Та звірі так не роблять!
common_voice_uk_39797261.mp3,Перейшов через свою кладку.
common_voice_uk_38190998.mp3,Пробачте: і тут лірика.


Unnamed: 0_level_0,sentence
path,Unnamed: 1_level_1
common_voice_it_28388367.mp3,Il timpano è opera sempre dello stesso autore.
common_voice_it_28388372.mp3,Passare per questo accesso dà un'idea dello sp...
common_voice_it_28388373.mp3,Il circondario di Torino fu creato come suddiv...
common_voice_it_39967131.mp3,"Mitchell, durante una scappatella con una segr..."
common_voice_it_40040061.mp3,"Se ne contano oltre novecento specie, delle qu..."


# Get hypotheses from the various models we want to evaluate

In [4]:
query_hypotheses = text('''
SELECT path, asr_tool, transcription
FROM transcriptions
WHERE transcription IS NOT NULL;
''')
df_hypotheses = pd.read_sql(sql=query_hypotheses, con=engine)
df_hypotheses = df_hypotheses.pivot(values='transcription', columns='asr_tool', index='path')
assert df_hypotheses.index.is_unique
df_hypotheses.head()

asr_tool,autosub,whisper-large-v3
path,Unnamed: 1_level_1,Unnamed: 2_level_1
common_voice_it_17420652.mp3,Prendere nella rete,Prendere nella rete
common_voice_it_17423688.mp3,Il padre non la volle ascoltare prese il morta...,"Il padre non la voglia ascoltare, prese il mo..."
common_voice_it_17426074.mp3,De Vincenzi senza aprirla\nSe la mise nella ta...,"De Vincenzi, senza aprirla, se l'ha misa nell..."
common_voice_it_17428122.mp3,Soldato che fugge buono per un'altra volta,Soldato che fugge è buono per un'altra volta.
common_voice_it_17429497.mp3,Le corna sono praticamente un prolungamento os...,Le corna sono praticamente un prolungamento o...


# Join references and hypotheses

In [5]:
df_joined = df_commons.join(other=df_hypotheses, on='path', validate='1:1').dropna()
df_joined['language'] = df_joined.index.map(lambda x: x.removeprefix('common_voice_')[0:2])
df_joined

Unnamed: 0_level_0,sentence,autosub,whisper-large-v3,language
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
common_voice_uk_23559291.mp3,Є й такі: не найде толку – буде тихо; Не найде...,Не найде толку\nБуде Тихо\nНе найдеш точки,"Є й такі, не найдеш толку – буде тихо, не най...",uk
common_voice_uk_23568344.mp3,"Тим важча хвороба божевілля володіє кожним, чи...",Чим важча хвороба божевілля володіє кожним\nВі...,"Тим важча хвороба божевілля володіє кожним, ч...",uk
common_voice_uk_32413787.mp3,Чи ти з розумом?,Чи ти з розумом,Чити з розумом.,uk
common_voice_uk_37910527.mp3,Спочатку діло йшло погано.,Спочатку діло йшло погано,Спочатку діло йшло погано.,uk
common_voice_uk_38200511.mp3,Тобі нема чого ховатись.,Тобі нема чого ховати,Тобі нема чого ховатись.,uk
...,...,...,...,...
common_voice_it_25890970.mp3,"Ha una sorella, Phoebe.",Ha una sorella P,"Ha una sorella, Febe.",it
common_voice_it_25919077.mp3,È Considerata una delle più importanti tifoser...,È considerata una delle più importanti tifoser...,È considerata una delle più importanti tifose...,it
common_voice_it_25920814.mp3,Stazione meteorologica di Piacenza,Stazione meteorologica di Piacenza,Stazione Meteorologica di Piacenza,it
common_voice_it_25980508.mp3,"Cooper, in seguito, definirà la mente di Earle...",In seguito definirà la mente di Earl brillante,"Cooper, in seguito, definirà la mente di Earl...",it


# Preprocess

For a fairer comparison between models. This preprocessing is rudimentary and perhaps somewhat opinionated but it boils down to this:
* Conver text to lowercase
* Remove punctuation
* Replace line breaks with spaces

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('\n', ' ')
    return text

df_joined_preprocessed = df_joined.copy().map(preprocess_text)
df_joined_preprocessed

Unnamed: 0_level_0,sentence,autosub,whisper-large-v3,language
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
common_voice_uk_23559291.mp3,є й такі не найде толку – буде тихо не найде ж...,не найде толку буде тихо не найдеш точки,є й такі не найдеш толку – буде тихо не найде...,uk
common_voice_uk_23568344.mp3,тим важча хвороба божевілля володіє кожним чим...,чим важча хвороба божевілля володіє кожним він...,тим важча хвороба божевілля володіє кожним чи...,uk
common_voice_uk_32413787.mp3,чи ти з розумом,чи ти з розумом,чити з розумом,uk
common_voice_uk_37910527.mp3,спочатку діло йшло погано,спочатку діло йшло погано,спочатку діло йшло погано,uk
common_voice_uk_38200511.mp3,тобі нема чого ховатись,тобі нема чого ховати,тобі нема чого ховатись,uk
...,...,...,...,...
common_voice_it_25890970.mp3,ha una sorella phoebe,ha una sorella p,ha una sorella febe,it
common_voice_it_25919077.mp3,è considerata una delle più importanti tifoser...,è considerata una delle più importanti tifoser...,è considerata una delle più importanti tifose...,it
common_voice_it_25920814.mp3,stazione meteorologica di piacenza,stazione meteorologica di piacenza,stazione meteorologica di piacenza,it
common_voice_it_25980508.mp3,cooper in seguito definirà la mente di earle b...,in seguito definirà la mente di earl brillante,cooper in seguito definirà la mente di earl b...,it


# Evaluation function

In [7]:
metrics_description = {
    'number_of_references': 'Number of reference sentences or parts of texts that were evaluated',
    'insertions': 'Extra words provided by the ASR model',
    'deletions': 'Words not transcribed by the ASR model',
    'substitutions': 'Reference words that the ASR model replaced with other ones',
    'hits': 'Correct words transcribed by the ASR model',
    'mer': ('Match Error Rate: Percentage of incorrect words (insertions, deletions, '
            'substitutions) divided by the total number of words in the reference'),
    'wil': 'Word Information Lost: Amount of information lost during transcription',
    'wip': 'Word Information Preserved: Amount of information preserved during transcription',
    'wer': 'Word Error Rate: Overall percentage of words that are incorrect'
}


def evaluate(references: list[str], hypotheses: list[str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    out = jiwer.process_words(
        reference=references,
        hypothesis=hypotheses
    )
    
    # score per sentences
    sentences_score_data = []
    references, hypotheses, alignements_list = out.references, out.hypotheses, out.alignments
    assert len(references) == len(hypotheses) == len(alignements_list)
    for reference, hypothesis, alignement_list in zip(references, hypotheses, alignements_list):
    
        alignements_operations = Counter(map(lambda x: x.type, alignement_list))
        alignements_operations.pop('equal', None)
        alignements_operations = None if not alignements_operations else alignements_operations
        sentences_score_record = dict(reference=tuple(reference),
                                      hypothesis=tuple(hypothesis),
                                      alignements_operations=alignements_operations)
        sentences_score_data.append(sentences_score_record)

    df_details = (pd.DataFrame(sentences_score_data)
                  .rename_axis(index='ref_ix')
                  .set_index('reference', append=True))

    # score total
    score_data_absolute = [
        {'metric': 'number_of_references', 'value': len(references)},
        {'metric': 'insertions', 'value': out.insertions},
        {'metric': 'deletions', 'value': out.deletions},
        {'metric': 'substitutions', 'value': out.substitutions},
        {'metric': 'hits', 'value': out.hits}
    ]
    df_score_absolute = pd.DataFrame(score_data_absolute).set_index('metric')

    score_data_rates = [
        {'metric': 'mer', 'value': out.mer},
        {'metric': 'wil', 'value': out.wil},
        {'metric': 'wip', 'value': out.wip},
        {'metric': 'wer', 'value': out.wer}
    ]
    df_score_rates = pd.DataFrame(score_data_rates).set_index('metric')
    return df_details, df_score_absolute, df_score_rates

# Evaluate

In [8]:
model_names = df_joined_preprocessed.columns.drop(['sentence', 'language']).tolist()
languages = df_joined_preprocessed['language'].unique().tolist()
print(f'{model_names=}\n{languages=}')

model_names=['autosub', 'whisper-large-v3']
languages=['uk', 'it']


## Compute scores accross languages

In [9]:
df_details_list = []
df_total_list = []
df_rates_list = []


for model in model_names:
    references = df_joined_preprocessed['sentence'].tolist()
    hypotheses = df_joined_preprocessed[model].tolist()
    df_details, df_total, df_rates = evaluate(references=references, hypotheses=hypotheses)

    df_details.columns = pd.MultiIndex.from_product([df_details.columns] + [[model]])
    df_total.columns = pd.MultiIndex.from_product([df_total.columns] + [[model]])
    df_rates.columns = pd.MultiIndex.from_product([df_rates.columns] + [[model]])

    df_details_list.append(df_details)
    df_total_list.append(df_total)
    df_rates_list.append(df_rates)

    
df_details = pd.concat(df_details_list, axis=1)
df_total = pd.concat(df_total_list, axis=1)
df_total['description'] = df_total.index.map(metrics_description)
df_total.set_index('description', append=True, inplace=True)

df_rates = pd.concat(df_rates_list, axis=1)
df_rates['description'] = df_rates.index.map(metrics_description)
df_rates.set_index('description', append=True, inplace=True)

## Compute scores per language

In [10]:
df_total_lang_list = []
df_rates_lang_list = []

# the detail view makes little sense here, since we already have all the data in `df_details`
for language in languages:
    language_slice = df_joined_preprocessed['language'] == language
    for model in model_names:
        references = df_joined_preprocessed.loc[language_slice, 'sentence'].tolist()
        hypotheses = df_joined_preprocessed.loc[language_slice, model].tolist()
        _, df_total_lang, df_rates_lang = evaluate(references=references, hypotheses=hypotheses)

        df_total_lang.columns = pd.MultiIndex.from_product([[language]] + [df_total_lang.columns] + [[model]])
        df_rates_lang.columns = pd.MultiIndex.from_product([[language]] + [df_rates_lang.columns] + [[model]])

        df_total_lang_list.append(df_total_lang)
        df_rates_lang_list.append(df_rates_lang)

df_total_lang = pd.concat(df_total_lang_list, axis=1)
df_total_lang['description'] = df_total_lang.index.map(metrics_description)
df_total_lang.set_index('description', append=True, inplace=True)

df_rates_lang = pd.concat(df_rates_lang_list, axis=1)
df_rates_lang['description'] = df_rates_lang.index.map(metrics_description)
df_rates_lang.set_index('description', append=True, inplace=True)

# Results 🚀

For reference, below are official benchmarks (for the Word error rate) from OpenAI. Please note that I truncated a lot of languages below Ukrainian, because the list is quite long. Please refer to the original picture on [Whisper's GitHub repo](https://github.com/openai/whisper).

![official benchmark whisper](official_benchmark_whisper.png)

Differences with my benchmark may be due to a variety of factors:
* insufficient or incorrect preprocessing
* small size of sample
* dataset version (I used common voices 17 and not common voices 15 like above)
* various updates of the model and library since the official benchmark was published
* usage of faster-whisper that may introduce minor discrepancies

## Accross languages

In [11]:
df_total

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,autosub,whisper-large-v3
metric,description,Unnamed: 2_level_2,Unnamed: 3_level_2
number_of_references,Number of reference sentences or parts of texts that were evaluated,1987,1987
insertions,Extra words provided by the ASR model,124,133
deletions,Words not transcribed by the ASR model,1643,284
substitutions,Reference words that the ASR model replaced with other ones,1633,1094
hits,Correct words transcribed by the ASR model,13557,15455


In [12]:
df_rates.style.format('{:,.2%}'.format)

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,autosub,whisper-large-v3
metric,description,Unnamed: 2_level_2,Unnamed: 3_level_2
mer,"Match Error Rate: Percentage of incorrect words (insertions, deletions, substitutions) divided by the total number of words in the reference",20.05%,8.91%
wil,Word Information Lost: Amount of information lost during transcription,28.70%,14.94%
wip,Word Information Preserved: Amount of information preserved during transcription,71.30%,85.06%
wer,Word Error Rate: Overall percentage of words that are incorrect,20.20%,8.98%


## Per language

In [13]:
df_total_lang

Unnamed: 0_level_0,Unnamed: 1_level_0,uk,uk,it,it
Unnamed: 0_level_1,Unnamed: 1_level_1,value,value,value,value
Unnamed: 0_level_2,Unnamed: 1_level_2,autosub,whisper-large-v3,autosub,whisper-large-v3
metric,description,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
number_of_references,Number of reference sentences or parts of texts that were evaluated,996,996,991,991
insertions,Extra words provided by the ASR model,47,76,77,57
deletions,Words not transcribed by the ASR model,1202,237,441,47
substitutions,Reference words that the ASR model replaced with other ones,985,724,648,370
hits,Correct words transcribed by the ASR model,4820,6046,8737,9409


In [14]:
df_rates_lang.style.format('{:,.2%}'.format)

Unnamed: 0_level_0,Unnamed: 1_level_0,uk,uk,it,it
Unnamed: 0_level_1,Unnamed: 1_level_1,value,value,value,value
Unnamed: 0_level_2,Unnamed: 1_level_2,autosub,whisper-large-v3,autosub,whisper-large-v3
metric,description,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
mer,"Match Error Rate: Percentage of incorrect words (insertions, deletions, substitutions) divided by the total number of words in the reference",31.67%,14.64%,11.77%,4.80%
wil,Word Information Lost: Amount of information lost during transcription,43.34%,23.80%,17.90%,8.40%
wip,Word Information Preserved: Amount of information preserved during transcription,56.66%,76.20%,82.10%,91.60%
wer,Word Error Rate: Overall percentage of words that are incorrect,31.88%,14.80%,11.87%,4.82%


# Speed performance

In [15]:
query_speed = text('''
SELECT asr_tool,
       COUNT(path) AS number_of_clips,
       SUM(duration) AS total_duration,
       AVG(duration) AS average_duration_per_clip
FROM transcriptions
WHERE duration IS NOT NULL
GROUP BY asr_tool;
''')
df_speed = pd.read_sql(sql=query_speed, con=engine, index_col='asr_tool')
df_speed.style.format({
    'total_duration': '{:,.2f} seconds'.format,
    'average_duration_per_clip':  '{:,.2f} seconds'.format
})

Unnamed: 0_level_0,number_of_clips,total_duration,average_duration_per_clip
asr_tool,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
autosub,1987,"5,154.89 seconds",2.59 seconds
whisper-large-v3,2000,"1,614.97 seconds",0.81 seconds
