# Loading children's speech data

In [92]:
import pandas as pd

data = pd.read_pickle('./asr_data/childrens_speech')
data

Unnamed: 0,text,audio,file_name
0,three,"[0.009646121, 0.021695968, 0.022539534, 0.0212...",english_children/english_words_sentences/11_M_...
1,ten,"[0.027527185, 0.032552492, 0.012983527, 0.0112...",english_children/english_words_sentences/11_M_...
2,nine,"[-0.03888016, -0.057698667, -0.045864653, -0.0...",english_children/english_words_sentences/11_M_...
3,four,"[0.04435131, 0.06784426, 0.058827527, 0.077713...",english_children/english_words_sentences/11_M_...
4,five,"[0.025824236, 0.029695712, 0.02606499, 0.03642...",english_children/english_words_sentences/11_M_...
...,...,...,...
634,and then they saw two frogs,"[-0.025713801, -0.044510987, -0.041728463, -0....",english_children/english_free_speech/files_cut...
635,but one frog didn't came up,"[0.010593225, 0.016391743, 0.012017116, 0.0188...",english_children/english_free_speech/files_cut...
636,and then they saw little baby frogs,"[0.015749954, 0.023017816, 0.018736875, 0.0202...",english_children/english_free_speech/files_cut...
637,and then all of the frogs was looking,"[-0.050608773, -0.07725125, -0.06856466, -0.07...",english_children/english_free_speech/files_cut...


# Loading transcribed data

The models used here are Whisper (tiny, base, and small) and lthe 960h Large wav2vec2

In [93]:

whisper_tiny = pd.read_pickle('./asr_data/tiny_whisper')
whisper_tiny['model'] = 'whisper_tiny'

whisper_base = pd.read_pickle('./asr_data/base_whisper')
whisper_base['model'] = 'whisper_base'

whisper_small = pd.read_pickle('./asr_data/small_whisper')
whisper_small['model'] = 'whisper_small'

wav2vec2_large_960h = pd.read_pickle('./asr_data/wav2vec_large_960h')
wav2vec2_large_960h['model'] = 'wav2vec2_large_960h'

# From one data frame with all models 
all_models = pd.concat([whisper_tiny, whisper_base, whisper_small, wav2vec2_large_960h], axis=0)


In [94]:
whisper_tiny[whisper_tiny['file_name'].str.contains('number')][['references', 'references_clean']]

Unnamed: 0,references,references_clean
5,three,3
6,two,2
7,nine,9
8,seven,7
9,one,one
...,...,...
407,ten,10
408,one,one
409,eight,8
410,five,5


# Calculating the Word Error Rates

In [95]:
import jiwer

wers = pd.DataFrame()
wers['model'] = ['whisper_tiny', 'whisper_base',
                 'whisper_small', 'wav2vec2_large_960h']

conditions = ['port', 'studio', 'nao', 'free_speech', 'words_sentence', 'number']

for i, row in enumerate(wers['model']):
    # Mixed data WERs
    subset = all_models[all_models['model'] == row]
    wers.at[i, 'mixed'] = jiwer.wer(list(subset["references_clean"]),
                                    list(subset["hypothesis_clean"]))
    for condition in conditions:
        # Calculate the WERs for all the conditions
        subset = all_models[(all_models['model'] == row ) & (all_models['file_name'].str.contains(condition))]
        wers.at[i, condition] = jiwer.wer(list(subset["references_clean"]),
                                          list(subset["hypothesis_clean"]))
        
      
# print(f"WER: {wer * 100:.2f} %"
wers


Unnamed: 0,model,mixed,port,studio,nao,free_speech,words_sentence,number
0,whisper_tiny,0.514314,0.426991,0.482222,0.695291,0.507883,0.523357,1.036232
1,whisper_base,0.423166,0.40708,0.437778,0.554017,0.396959,0.460016,1.007246
2,whisper_small,0.317868,0.34292,0.335556,0.440443,0.282095,0.368171,0.913043
3,wav2vec2_large_960h,0.621257,0.537611,0.604444,0.891967,0.591779,0.662708,0.945652


# Calculating the Levenshtein distance 



In [96]:
from Levenshtein import ratio
from Levenshtein import distance
from Levenshtein import hamming
import statistics
lds = pd.DataFrame()
lds['model'] = ['whisper_tiny', 'whisper_base',
                 'whisper_small', 'wav2vec2_large_960h']

conditions = ['port', 'studio', 'nao', 'free_speech', 'words_sentence', 'number']

for i, row in enumerate(lds['model']):
    subset = all_models[all_models['model'] == row]
    ld = []
    for j, sub_row in subset.iterrows():
        ld.append(distance(sub_row["references_clean"],
                           sub_row["hypothesis_clean"]) / len(sub_row["references_clean"]))
    # Get the mean LD
    lds.at[i, 'mixed'] = round(statistics.fmean(ld),2)
 
    for condition in conditions:
        subset = all_models[(all_models['model'] == row) & (
              all_models['file_name'].str.contains(condition))]
        ld = []
        for j, sub_row in subset.iterrows():
            ld.append(distance(sub_row["references_clean"], sub_row["hypothesis_clean"]) /  len(sub_row["references_clean"]))
        lds.at[i, condition] = round(statistics.fmean(ld),2)

lds
print(lds.to_latex(index=False))


\begin{tabular}{lrrrrrrr}
\toprule
              model &  mixed &  port &  studio &  nao &  free\_speech &  words\_sentence &  number \\
\midrule
       whisper\_tiny &   1.74 &  1.97 &    2.38 & 3.24 &         0.37 &            2.47 &    3.59 \\
       whisper\_base &   1.69 &  2.16 &    2.52 & 2.69 &         0.28 &            2.44 &    3.58 \\
      whisper\_small &   1.51 &  1.98 &    2.05 & 2.73 &         0.19 &            2.21 &    3.27 \\
wav2vec2\_large\_960h &   1.53 &  2.02 &    2.08 & 2.42 &         0.36 &            2.15 &    3.08 \\
\bottomrule
\end{tabular}



  print(lds.to_latex(index=False))


In [97]:

nlu_performance = pd.DataFrame()
nlu_performance['model']= ['whisper_tiny', 'whisper_base',
'whisper_small', 'wav2vec2_large_960h']

# for i, row in enumerate(lds['model']):
#     numbers = all_models[
#             (all_models['model'] == row) &
#             (all_models['file_name'].str.contains('number'))
#     ]

numbers = whisper_base[['number' not in x for x in whisper_base['file_name']]][['hypothesis_clean', 'references_clean']]
len(whisper_tiny[whisper_tiny['file_name'].str.contains(
    'number')])


numbers

Unnamed: 0,hypothesis_clean,references_clean
0,squares coming out of the hole,squirrel is coming out the hole
1,the boys are late to focus on the focus of it,the boy is awake the frog the dog is awake
2,the boy when it comes out was fun,the boy went comes out of bed
3,to boycott him down off the window,the boy climbed out of the window
4,i just do not like run out of mud,the dog went out of bed
...,...,...
614,the dog is on the top of the shed,the dog is on top of the shed
625,the fishes in the pond,the fish is in the pond
626,the horse is next to the stable,the horse is next to the stable
627,the horse is behind the cart,the horse is behind the car


In [144]:
# from unittest import expectedFailure
import requests
# TODO: include 'negative exmaples'
# Create an empty list to store predictions with size of ASR model predcitons

nlu_performance = pd.DataFrame()
nlu_performance['model'] = ['whisper_tiny', 'whisper_base',
                            'whisper_small', 'wav2vec2_large_960h']
url = 'http://localhost:5005/model/parse'

for i, row in enumerate(lds['model']):
    numbers = all_models[
        (all_models['model'] == row ) &
        (all_models['file_name'].str.contains('number'))
    ]

    # not_numbers = all_models[
    #     (all_models['model'] == row) &
    #     (['number' not in x for x in all_models['file_name']])
    # ]

    # not_numbers = not_numbers.tail(len(numbers))

    # all_tests = pd.concat([numbers, not_numbers], ignore_index=True)
    # preds = [None] * (len(numbers) + len(not_numbers))
    intent_preds = [None] * (len(numbers))
    entity_preds = [None] * (len(numbers))

    for j, row in enumerate(numbers['hypothesis_clean']):

        payload = {
            "text": row
        }
        try:
            responses = requests.post(url, json=payload).json()
           
        except Exception as e:
            print(e)
        intent_preds[j] = responses['intent']['name'] == 'math_game_count'
     
        if responses['entities']:
            print(responses['entities'])
            print(responses['entities'][0]['entity'] == 'math_game_number')
            entity_preds[j] = any(entity['entity'] == 'math_game_number' for entity in responses['entities'])
        else:
            entity_preds[j] = False
    # print(i)
    nlu_performance.at[i, 'inten_acc'] = sum(intent_preds) / len(intent_preds)
    nlu_performance.at[i, 'entity_acc'] = sum(entity_preds) / len(entity_preds)


    # Pass uttered number hypothothesis to Rasa's NLU 

# url = 'https://260b-41-193-216-247.sa.ngrok.io/model/parse'



[{'entity': 'math_game_number', 'start': 0, 'end': 1, 'confidence_entity': 0.9990130662918091, 'value': '9', 'extractor': 'DIETClassifier'}]
True
[{'entity': 'math_game_number', 'start': 0, 'end': 1, 'confidence_entity': 0.9990130662918091, 'value': '8', 'extractor': 'DIETClassifier'}]
True
[{'entity': 'math_game_number', 'start': 0, 'end': 2, 'confidence_entity': 0.9990130662918091, 'value': '24', 'extractor': 'DIETClassifier'}]
True
[{'entity': 'math_game_number', 'start': 0, 'end': 1, 'confidence_entity': 0.9990130662918091, 'value': '6', 'extractor': 'DIETClassifier'}]
True
[{'entity': 'math_game_number', 'start': 0, 'end': 1, 'confidence_entity': 0.9990130662918091, 'value': '9', 'extractor': 'DIETClassifier'}]
True
[{'entity': 'items', 'start': 18, 'end': 24, 'confidence_entity': 0.5104098320007324, 'value': 'kicked', 'extractor': 'DIETClassifier'}, {'entity': 'items', 'start': 25, 'end': 29, 'confidence_entity': 0.7845388650894165, 'value': 'over', 'extractor': 'DIETClassifier'}

In [145]:
entity_preds

[True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 True,


In [146]:
print(nlu_performance.to_latex(index=False))

\begin{tabular}{lrr}
\toprule
              model &  inten\_acc &  entity\_acc \\
\midrule
       whisper\_tiny &   0.362319 &    0.362319 \\
       whisper\_base &   0.402174 &    0.405797 \\
      whisper\_small &   0.369565 &    0.369565 \\
wav2vec2\_large\_960h &   0.210145 &    0.206522 \\
\bottomrule
\end{tabular}



  print(nlu_performance.to_latex(index=False))


In [32]:
# Get the accuracy of predictions 
sum(preds) / len(preds)

0.40217391304347827