In [45]:
import glob
import pandas as pd
import os
import json
import numpy as np

In [15]:
# 1. Read ASR result
# 2. Extract features
# 3. Write dataframe

In [16]:
# Directory holding one ASR result JSON file per recording.
# Set the JSON_ASR_RESULTS_DIR environment variable to point the notebook at
# another location without editing this cell; the original path is the default.
jsonAsrResultsDir = os.environ.get(
    'JSON_ASR_RESULTS_DIR',
    '/vol/bigdata3/datasets3/dutch_child_audio/dart/preposttest_final/05_asr_experiments/whispert_dis_prompts/json-asr-results',
)

In [146]:
def readWhisperToutputJSON(jsonFile):
    """Read an ASR result JSON file and return its parsed content.

    Parameters
    ----------
    jsonFile : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        The value produced by ``json.load`` for the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # locale default, which can mis-decode non-ASCII characters in the text.
    with open(jsonFile, 'r', encoding='utf-8') as f:
        return json.load(f)

def getDescriptiveStatistics(scores):
    """Summarise a sequence of numbers with pandas' describe() plus the IQR.

    Parameters
    ----------
    scores : sequence of numbers
        Values to summarise; may be empty.

    Returns
    -------
    dict
        The keys of ``Series.describe()`` for numeric data ('count', 'mean',
        'std', 'min', '25%', '50%', '75%', 'max') plus 'IQR', the difference
        between the 75% and 25% quantiles rounded to 3 decimals. For empty
        input 'count' is 0.0 and every other value is NaN.
    """
    # Force a float dtype: an empty list would otherwise become an object
    # Series, whose describe() has no percentile keys, and the IQR lookup
    # below would raise KeyError.
    scores_dict = pd.Series(scores, dtype='float64').describe().to_dict()
    scores_dict['IQR'] = round(scores_dict['75%'] - scores_dict['25%'], 3)
    return scores_dict

def itemDurationAndConfidenceAnalysis(items):
    """Summarise the durations and confidence scores of a list of items.

    Every item is read through its 'start', 'end' and 'confidence' keys; a
    duration is ``end - start`` rounded to 3 decimals.

    Returns
    -------
    tuple of (dict, dict)
        Duration statistics and confidence statistics, each as returned by
        getDescriptiveStatistics. For an empty list both are computed from a
        placeholder sample of four NaN values.
    """
    if len(items) == 0:
        # Nothing to measure: describe an all-NaN sample instead.
        nan_sample = [np.nan, np.nan, np.nan, np.nan]
        return getDescriptiveStatistics(nan_sample), getDescriptiveStatistics(nan_sample)

    span_lengths = [round(entry['end'] - entry['start'], 3) for entry in items]
    duration_summary = getDescriptiveStatistics(span_lengths)

    confidence_values = [entry['confidence'] for entry in items]
    confidence_summary = getDescriptiveStatistics(confidence_values)

    return duration_summary, confidence_summary

def pausesAnalysis(items):
    """Measure the gaps between consecutive items.

    A gap runs from one item's 'end' to the following item's 'start'.

    Returns
    -------
    tuple
        ``(stats, gaps)``: ``stats`` is the getDescriptiveStatistics summary of
        the gaps whose duration differs from 0.0, and ``gaps`` is a numpy array
        with every gap duration, zeros included.
    """
    # The first 'start' closes the leading pause and the last 'end' opens the
    # trailing pause, so neither of them delimits a gap between two items.
    gap_closings = np.array([entry['start'] for entry in items][1:])
    gap_openings = np.array([entry['end'] for entry in items][:-1])
    gap_lengths = gap_closings - gap_openings

    # Zero-length gaps are excluded from the statistics only.
    nonzero_gaps = list(filter(lambda gap: gap != 0.0, gap_lengths))

    return getDescriptiveStatistics(nonzero_gaps), gap_lengths
    

def changeNamesOfKeys(outputDict, prefix):
    """Return a new dict with ``prefix`` prepended to every key of ``outputDict``.

    Values are carried over unchanged and the input dict is not modified.
    """
    return {prefix + name: val for name, val in outputDict.items()}

def getReadingFluencyStatistics(words, pauses_durations, pauses2_durations):
    """Compute overall timing and rate measures for one recording.

    Parameters
    ----------
    words : list of dict
        Items with 'start' and 'end' values, in reading order.
    pauses_durations : array-like of float
        Pause durations subtracted from the reading time to get 'phonationTime'.
    pauses2_durations : array-like of float
        Alternative pause durations, used for 'phonationTime2'.

    Returns
    -------
    dict
        'totalReadingTime' (start of first word to end of last word),
        'speechRate(WPM)' (words per minute of reading time),
        'phonationTime' / 'phonationTime2' (reading time minus the pauses) and
        'articulationRate' / 'articulationRate2' (words per minute of
        phonation time). Rates are rounded to 3 decimals. All values are NaN
        when ``words`` is empty, and a rate is NaN when its time base is zero.
    """
    def _perMinute(count, seconds):
        # Rate per minute; NaN instead of ZeroDivisionError/inf when no time elapsed.
        if seconds == 0:
            return np.nan
        return round(count / (seconds / 60), 3)

    if len(words) == 0:
        # No recognised words: there is no first/last word to time.
        return {
            'totalReadingTime' : np.nan,
            'speechRate(WPM)' : np.nan,
            'phonationTime' : np.nan,
            'phonationTime2' : np.nan,
            'articulationRate' : np.nan,
            'articulationRate2' : np.nan,
        }

    # Total duration (excl. begin and end silence)
    startTimeReading = words[0]['start']
    endTimeReading = words[-1]['end']
    totalReadingTime = endTimeReading - startTimeReading

    phonationTime = totalReadingTime - np.sum(pauses_durations)
    phonationTime2 = totalReadingTime - np.sum(pauses2_durations)

    return {
        'totalReadingTime' : totalReadingTime,
        'speechRate(WPM)' : _perMinute(len(words), totalReadingTime),
        'phonationTime' : phonationTime,
        'phonationTime2' : phonationTime2,
        'articulationRate' : _perMinute(len(words), phonationTime),
        'articulationRate2' : _perMinute(len(words), phonationTime2),
    }

In [147]:
# glob returns matches in arbitrary order; sort them so the subset selected
# below is the same on every run.
jsonAsrResultsList = sorted(glob.glob(os.path.join(jsonAsrResultsDir, '*.json')))

# Development limit on the number of files processed; set to None for all files.
MAX_FILES = 3

outputDict = {}
for jsonAsrResult in jsonAsrResultsList[:MAX_FILES]:

    # Recording ID: file name without its extension. splitext removes only the
    # final suffix, whereas str.replace would drop every '.json' in the name.
    basename = os.path.splitext(os.path.basename(jsonAsrResult))[0]

    # 1. Read JSON file
    data = readWhisperToutputJSON(jsonAsrResult)

    # 2. Extract features from JSON file
    text = data['text']  # not used in the feature set below
    segments = data['segments']

    # SEGMENTS
    nrOfSegments = len(segments)  # not used in the feature set below

    # ITEMS
    # Extract items, these can either be recognized words or disfluencies [*]
    items = [segment['words'] for segment in segments]
    items_flatten = [word for words_segment in items for word in words_segment]

    # ITEMS - DISFLUENCIES
    disfluencies = [item for item in items_flatten if item['text'] == "[*]"]
    stats_durations_disfluencies, stats_conf_disfluencies = itemDurationAndConfidenceAnalysis(disfluencies)
    stats_durations_disfluencies = changeNamesOfKeys(stats_durations_disfluencies, 'disfl_dur_')
    stats_conf_disfluencies = changeNamesOfKeys(stats_conf_disfluencies, 'disfl_conf_')

    # ITEMS - WORDS
    words = [item for item in items_flatten if item['text'] != "[*]"]
    stats_durations_words, stats_conf_words = itemDurationAndConfidenceAnalysis(words)
    stats_durations_words = changeNamesOfKeys(stats_durations_words, 'words_dur_')
    stats_conf_words = changeNamesOfKeys(stats_conf_words, 'words_conf_')

    # ITEMS - PAUSES I (disfluencies are pauses): gaps are measured between
    # words only, so time taken up by a disfluency counts as pause time.
    stats_pauses_durations, pauses_durations = pausesAnalysis(words)
    stats_pauses_durations = changeNamesOfKeys(stats_pauses_durations, 'pauses_dur_')

    # ITEMS - PAUSES II (disfluencies are not pauses): gaps are measured
    # between all items, words and disfluencies alike.
    stats_pauses2_durations, pauses2_durations = pausesAnalysis(items_flatten)
    stats_pauses2_durations = changeNamesOfKeys(stats_pauses2_durations, 'pauses2_dur_')

    # Overall reading fluency statistics
    stats_reading_fluency = getReadingFluencyStatistics(words, pauses_durations, pauses2_durations)

    # One flat feature dict per recording; the key prefixes keep the groups apart.
    outputDict[basename] = {
        **stats_reading_fluency,
        **stats_durations_disfluencies,
        **stats_conf_disfluencies,
        **stats_durations_words,
        **stats_conf_words,
        **stats_pauses_durations,
        **stats_pauses2_durations,
    }


In [148]:
# outputDict maps recording -> feature dict; transpose so each recording is a row.
pd.DataFrame(outputDict).T

Unnamed: 0,totalReadingTime,speechRate(WPM),phonationTime,phonationTime2,articulationRate,articulationRate2,disfl_dur_count,disfl_dur_mean,disfl_dur_std,disfl_dur_min,...,pauses_dur_IQR,pauses2_dur_count,pauses2_dur_mean,pauses2_dur_std,pauses2_dur_min,pauses2_dur_25%,pauses2_dur_50%,pauses2_dur_75%,pauses2_dur_max,pauses2_dur_IQR
10101_posttest0_11.json,154.5,12.039,34.41,52.26,54.054,35.591,14.0,1.475,1.475871,0.28,...,3.008,30.0,3.408,4.02256,0.04,1.305,2.355,3.5075,17.41,2.203
10101_posttest2_11.json,130.79,9.634,16.4,33.14,76.829,38.021,21.0,0.812381,0.545389,0.17,...,3.662,21.0,4.65,4.337457,1.23,2.19,3.22,5.66,20.48,3.47
10101_pretest0_11.json,4.42,40.724,0.86,0.86,209.302,209.302,0.0,,,,...,0.92,2.0,1.78,1.301076,0.86,1.32,1.78,2.24,2.7,0.92


6