In [2]:
import time
from datasets import load_dataset

In [32]:
import pandas as pd
# Load the children's speech table (the preview below shows `text`, `audio`
# and `file_name` columns) from a local pickle file.
# NOTE: unpickling can execute arbitrary code -- only load pickles that come
# from a trusted source.
childrens_speech = pd.read_pickle('./asr_data/childrens_speech')

Unnamed: 0,text,audio,file_name
241,ten,"[-0.036601093, -0.03209262, -0.0064707506, -0....",english_children/english_words_sentences/01_M_...
260,three,"[-0.07871861, -0.11767402, -0.08629214, -0.088...",english_children/english_words_sentences/04_M_...
157,nine,"[-0.058444694, -0.068655446, -0.04156081, -0.0...",english_children/english_words_sentences/03_F_...
632,and the dog was holding the tree,"[-0.007677773, -0.013845495, -0.015996212, -0....",english_children/english_free_speech/files_cut...
168,two,"[-0.006614069, -0.0076644183, -0.00919833, -0....",english_children/english_words_sentences/03_F_...
352,three,"[-0.109292276, -0.12106309, 0.024652164, 0.050...",english_children/english_words_sentences/10_M_...
120,ten,"[0.015084553, 0.03357533, 0.028535083, 0.00911...",english_children/english_words_sentences/08_F_...
389,two,"[-0.010832926, 0.09497849, 0.2126958, 0.239577...",english_children/english_words_sentences/09_F_...
495,a boy looking at the frog,"[-0.16036332, -0.24208821, -0.21192491, -0.227...",english_children/english_free_speech/files_cut...
602,the boy looked in the shouting in the hole fro...,"[-0.00055857835, -0.001537919, -0.0012469826, ...",english_children/english_free_speech/files_cut...


In [3]:
# Load the "validation" split of the "clean" configuration of the
# hf-internal-testing/librispeech_asr_demo dataset.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo", "clean", split="validation")

Found cached dataset librispeech_asr_demo (/Users/ryanselesnik/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [4]:
# Inspect the loaded dataset (feature names and row count).
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [12]:
from transformers import pipeline
import whisper


def transcribe(model, audio):
    """
    Transcribe audio to text with the requested ASR model.

    The model is loaded on every call, so code that times this function
    measures model load time as well as inference time.

    args:
        model: str selecting the backend. A name containing 'wav' runs the
            Hugging Face ASR pipeline with 'facebook/wav2vec2-large-960h';
            a name containing '.en' is passed to whisper.load_model
            (e.g. 'tiny.en').
        audio: input forwarded unchanged to the selected model.

    returns:
        The transcribed text. For the wav2vec2 backend a non-dict pipeline
        result (e.g. the list returned for batched input) is passed through
        unchanged.

    raises:
        ValueError: if model matches neither supported backend.
    """
    if 'wav' in model:
        asr = pipeline(task='automatic-speech-recognition',
                       model='facebook/wav2vec2-large-960h')
        result = asr(audio)
        # The pipeline wraps the transcription in a dict; unwrap it so both
        # backends hand back plain text.
        return result['text'] if isinstance(result, dict) else result

    if '.en' in model:
        asr = whisper.load_model(model)
        return asr.transcribe(audio)['text']

    # Fail loudly instead of silently returning None for an unknown name.
    raise ValueError(
        f"Unsupported model {model!r}: expected a name containing 'wav' "
        "(wav2vec2) or '.en' (English-only Whisper)."
    )


In [45]:
import pandas as pd
import statistics
from regex import R

# One row per Whisper checkpoint to benchmark.
rtf_data = pd.DataFrame()
rtf_data['model'] = ['tiny.en', 'base.en', 'small.en']

# Sample rate used to convert a clip's sample count into seconds.
# NOTE(review): 22500 Hz is an unusual rate (22050 Hz and 16000 Hz are the
# common ones) -- confirm against how the pickled audio was produced, since
# every duration and RTF below scales with this value.
SAMPLE_RATE = 22500.0

# Fixed seed so the same 50 clips are benchmarked on every run.
SAMPLE_SEED = 0

# Init data
# dataset = dataset[:5]['audio']
dataset = childrens_speech.sample(50, random_state=SAMPLE_SEED)['audio']

# For each model size
for i, model in enumerate(rtf_data['model']):

    inference_times = []
    sample_durations = []
    rtfs = []

    # For each audio clip in the sample
    for audio in dataset:
        # Timing the whole transcribe() call takes into account the
        #   - load time (the model is loaded inside transcribe)
        #   - encoding time
        #   - decoding time
        start = time.perf_counter()
        transcribe(model, audio)
        inf_time = time.perf_counter() - start

        # Clip length in seconds; len() is enough, no need to copy the samples.
        sample_duration = len(audio) / SAMPLE_RATE

        # Store data
        inference_times.append(inf_time)
        sample_durations.append(sample_duration)
        # Real-time factor: processing seconds per second of audio.
        rtfs.append(inf_time / sample_duration)

    # Write the aggregate statistics to the corresponding model row
    rtf_data.at[i, 'av_inf_time'] = statistics.fmean(inference_times)
    rtf_data.at[i, 'av_duration'] = statistics.fmean(sample_durations)
    rtf_data.at[i, 'duration_std'] = statistics.stdev(sample_durations)

    rtf_data.at[i, 'av_RTF'] = statistics.fmean(rtfs)
    rtf_data.at[i, 'RTF_std'] = statistics.stdev(rtfs)
    




In [46]:
# Display the per-model timing summary table.
rtf_data


Unnamed: 0,model,av_inf_time,av_duration,duration_std,av_RTF,RTF_std
0,tiny.en,1.755711,2.463612,1.606676,1.032647,0.971657
1,base.en,4.87661,2.463612,1.606676,3.253959,5.616636
2,small.en,13.352413,2.463612,1.606676,8.077425,10.379376
