# Loading children's speech data

In [78]:
import pandas as pd

# Load the pickled children's-speech table from the local asr_data directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
data = pd.read_pickle('./asr_data/childrens_speech')
# Bare last expression so the notebook renders the frame; the rendered output
# shows the columns text, audio and file_name.
data

Unnamed: 0,text,audio,file_name
0,three,"[0.009646121, 0.021695968, 0.022539534, 0.0212...",english_children/english_words_sentences/11_M_...
1,ten,"[0.027527185, 0.032552492, 0.012983527, 0.0112...",english_children/english_words_sentences/11_M_...
2,nine,"[-0.03888016, -0.057698667, -0.045864653, -0.0...",english_children/english_words_sentences/11_M_...
3,four,"[0.04435131, 0.06784426, 0.058827527, 0.077713...",english_children/english_words_sentences/11_M_...
4,five,"[0.025824236, 0.029695712, 0.02606499, 0.03642...",english_children/english_words_sentences/11_M_...
...,...,...,...
634,and then they saw two frogs,"[-0.025713801, -0.044510987, -0.041728463, -0....",english_children/english_free_speech/files_cut...
635,but one frog didn't came up,"[0.010593225, 0.016391743, 0.012017116, 0.0188...",english_children/english_free_speech/files_cut...
636,and then they saw little baby frogs,"[0.015749954, 0.023017816, 0.018736875, 0.0202...",english_children/english_free_speech/files_cut...
637,and then all of the frogs was looking,"[-0.050608773, -0.07725125, -0.06856466, -0.07...",english_children/english_free_speech/files_cut...


# Loading transcribed data

The models used here are Whisper (tiny, base, and small) and the 960h Large wav2vec2 model.

In [79]:

# Read each model's pickled transcriptions and tag every row with the name of
# the model that produced it, so rows stay identifiable once stacked.
# NOTE(review): pickle files are assumed to come from a trusted source.
whisper_tiny = pd.read_pickle('./asr_data/tiny_whisper').assign(model='whisper_tiny')
whisper_base = pd.read_pickle('./asr_data/base_whisper').assign(model='whisper_base')
whisper_small = pd.read_pickle('./asr_data/small_whisper').assign(model='whisper_small')
wav2vec2_large_960h = pd.read_pickle('./asr_data/wav2vec_large_960h').assign(
    model='wav2vec2_large_960h'
)

# Form one data frame with all models: rows are stacked vertically and each
# source frame keeps its own index labels (no re-indexing is requested).
all_models = pd.concat(
    [
        whisper_tiny,
        whisper_base,
        whisper_small,
        wav2vec2_large_960h,
    ],
    axis=0,
)


# Calculating the Word Error Rates

In [107]:
import jiwer

# Substrings of `file_name` that identify each evaluation condition.
conditions = ['port', 'studio', 'nao', 'free_speech', 'words_sentence', 'number']

# One boolean mask per condition, selecting rows whose file name contains it.
condition_masks = {
    condition: all_models['file_name'].str.contains(condition)
    for condition in conditions
}

wer_records = []
for model_name in ['whisper_tiny', 'whisper_base',
                   'whisper_small', 'wav2vec2_large_960h']:
    of_model = all_models['model'] == model_name

    # 'mixed' pools every utterance transcribed by this model.
    pooled = all_models[of_model]
    scores = {
        'model': model_name,
        'mixed': jiwer.wer(pooled["references_clean"].tolist(),
                           pooled["hypothesis_clean"].tolist()),
    }

    # Then one WER per condition, restricted to the matching file names.
    for condition, in_condition in condition_masks.items():
        selected = all_models[of_model & in_condition]
        scores[condition] = jiwer.wer(selected["references_clean"].tolist(),
                                      selected["hypothesis_clean"].tolist())

    wer_records.append(scores)

# Build the table in one step: one row per model, one column per score.
wers = pd.DataFrame(wer_records)
wers


Unnamed: 0,model,mixed,port,studio,nao,free_speech,words_sentence,number
0,whisper_tiny,0.514314,0.426991,0.482222,0.695291,0.507883,0.523357,1.036232
1,whisper_base,0.423166,0.40708,0.437778,0.554017,0.396959,0.460016,1.007246
2,whisper_small,0.317868,0.34292,0.335556,0.440443,0.282095,0.368171,0.913043
3,wav2vec2_large_960h,0.621257,0.537611,0.604444,0.891967,0.591779,0.662708,0.945652


# Calculating the Levenshtein distance 



In [159]:
from Levenshtein import distance
import statistics
def _mean_edit_distance(frame):
    """Return the mean Levenshtein distance over the rows of ``frame``.

    Each row contributes ``distance(references_clean, hypothesis_clean)``;
    the distances are averaged with ``statistics.fmean``, which converts them
    to floats.

    Raises ``statistics.StatisticsError`` when ``frame`` has no rows.
    """
    # Walk the two columns in lockstep rather than using iterrows(), which
    # builds a full Series for every row just to read two values from it.
    pairs = zip(frame["references_clean"], frame["hypothesis_clean"])
    return statistics.fmean([distance(reference, hypothesis)
                             for reference, hypothesis in pairs])


# Substrings of `file_name` that identify each evaluation condition.
conditions = ['port', 'studio', 'nao', 'free_speech', 'words_sentence', 'number']

ld_records = []
for model_name in ['whisper_tiny', 'whisper_base',
                   'whisper_small', 'wav2vec2_large_960h']:
    of_model = all_models['model'] == model_name

    # 'mixed' averages over every utterance transcribed by this model.
    scores = {
        'model': model_name,
        'mixed': _mean_edit_distance(all_models[of_model]),
    }

    # Then one mean distance per condition, restricted to matching file names.
    for condition in conditions:
        in_condition = all_models['file_name'].str.contains(condition)
        scores[condition] = _mean_edit_distance(all_models[of_model & in_condition])

    ld_records.append(scores)

# Build the table in one step: one row per model, one column per score.
lds = pd.DataFrame(ld_records)
lds


Unnamed: 0,model,mixed,port,studio,nao,free_speech,words_sentence,number
0,whisper_tiny,7.72144,4.098684,4.86,7.321739,12.342342,5.261391,4.021739
1,whisper_base,6.2723,3.986842,4.266667,5.8,9.436937,4.58753,3.942029
2,whisper_small,4.649452,3.315789,3.3,4.756522,6.418919,3.707434,3.547101
3,wav2vec2_large_960h,7.749609,4.375,5.033333,7.608696,11.968468,5.503597,3.387681
