In [14]:
import json
import glob
import pandas as pd
import reversedadapt.getAlignmentsPosttestDart as reversedadapt
import reversedadapt.textToParts as adapttexttoparts

In [2]:
# Read every Whisper JSON result found in the directory and store the parsed content
# in whisperDict, keyed by the file name without its '.json' extension.

# glob returns paths in arbitrary (file-system dependent) order; sorting makes the
# order of whisperDict, and of everything derived from it, reproducible across runs.
jsonFiles = sorted(glob.glob('/vol/tensusers5/wharmsen/astla-data/dart-preposttest/slate-data/2-whisper-t-dis/*json'))

# All SLaTE results in Dict
whisperDict = {}
for jsonFile in jsonFiles:
    # Explicit encoding so that decoding does not depend on the platform default
    # (assumes the files are UTF-8, the standard JSON interchange encoding).
    with open(jsonFile, 'r', encoding='utf-8') as f:
        data = json.load(f)
    fileName = jsonFile.split('/')[-1].replace('.json', '')
    whisperDict[fileName] = data

print(whisperDict.keys())

dict_keys(['09eae7df-db7c-491b-b7c4-a7c9fcf62f47', '74be41e8-d940-469f-8ebf-feb90250f9e8', '97549707-f715-4fe5-926c-b272b388e368', 'aa809d8a-45d1-4cc0-a2c2-b40066d2849e', 'd52f2fbc-588a-4002-8c02-772dabdbfc16', 'e5920085-8c1b-415c-b25d-24b5e5a4a2bf'])


In [3]:
# Build, per recording, a lookup from each recognised word to its confidence and timing.
# Disfluency markers ('[*]') are left out.
# NOTE: the lookup is keyed by the word text, so a word that occurs more than once in a
# recording keeps only the values of its last occurrence.

whisperTranscriptionDict = {}
for recordingId, whisperResult in whisperDict.items():
    # Flatten the words of all segments into one stream
    recognisedWords = (
        wordEntry
        for segment in whisperResult['segments']
        for wordEntry in segment['words']
    )
    whisperTranscriptionDict[recordingId] = {
        wordEntry['text']: {
            'confidence': wordEntry['confidence'],
            'start': wordEntry['start'],
            'end': wordEntry['end'],
        }
        for wordEntry in recognisedWords
        if wordEntry['text'] != '[*]'
    }

print(whisperTranscriptionDict['09eae7df-db7c-491b-b7c4-a7c9fcf62f47']['jong'])

#df = pd.DataFrame(whisperOutputList, columns=['wh_word', 'wh_acc', 'wh_start', 'wh_end'])
#df.to_csv('../../astla-data/dart-preposttest/slate-data/5-asr-analysis-results/whisper-word-assessments.csv', index=False)
            


{'confidence': 0.158, 'start': 0.94, 'end': 2.04}


In [4]:
# For each word in the prompt, we need to extract a correct/incorrect assessment from the Whisper ASR-output.
# We will compute this assessment in two ways:
# 1) Align prompt and output
# 2) For each prompt word, check whether it is in the output

# A prompt word is considered incorrectly pronounced when it is absent from the ASR output, or when it is present in the ASR output but has a very low confidence score.


In [5]:
# Reference word lists: one row per test, indexed by the 'name' column
references = pd.read_csv(
    "../../astla-data/dart-preposttest/data-in-different-formats/1-input-files/reference_wordlist.csv",
    sep=";",
).set_index('name')
references

Unnamed: 0_level_0,prompt
name,Unnamed: 1_level_1
pretest0,buik web lift grapje spierkracht bal sterk mee...
pretest1,pauw sterren gat proost lijn vlo kijken lus ba...
pretest2,jong lach strik vuur specht keelpijn sla buite...
posttest0,draad strik bloemen meetlat schroef waarde spr...
posttest1,lesje kijken hagel Jaap sterk bestaat dure den...
posttest2,vlo mooi trommel stoep nicht onze slepen raamp...


In [6]:
# Teacher-assessment files; their names are split on '_' to find out which test
# each recording belongs to.
filelist = glob.glob("/vol/tensusers5/wharmsen/astla-data/dart-preposttest/slate-data/1-teacherAssessments/*.csv")

# Only file names consisting of exactly three '_'-separated parts are used:
# the first part is the audio id, the second part the test name.
splitFileNames = (path.split('/')[-1].split('_') for path in filelist)
fileTestDict = {
    nameParts[0]: nameParts[1]
    for nameParts in splitFileNames
    if len(nameParts) == 3
}

In [7]:
# Source: alignments.py
import helper_scripts.aligner as aligner

def getPrompt(audio):
    """Return the prompt text of the test that recording `audio` belongs to.

    The test name is looked up in the module-level `fileTestDict`; the prompt is the
    'prompt' cell of the matching row in the module-level `references` frame.
    A KeyError is raised when `audio` or its test name is unknown.
    """
    return references.loc[fileTestDict[audio], 'prompt']

def checkIfPromptInAsrTransAligned(prompt, asrTransAligned):
    """Tell whether `prompt` is contained in `asrTransAligned`.

    This is a plain `in` test: for a string container it is a substring check
    (so "jon" is found in "jong"), for a list it is element membership.
    """
    isContained = prompt in asrTransAligned
    return isContained


# Hand-made (prompt, ASR transcription) pairs used to try out the alignment function
testDict = [
    {'prompt': promptText, 'asrTrans': asrText}
    for promptText, asrText in (
        # false start, substitution, substitution
        ("jong lach zieke", "jon jong lang ring"),
        # false start, third of four attempts correct, false start
        ("jong onze zieke", "jon jong onzin onz onze z zeker"),
        # second word not pronounced
        ("jong onze zieke", "jong zieke"),
        # second word not pronounced, with false starts and an extra attempt
        ("jong onze zieke", "jon jong z zieke zeker"),
    )
]

# Run the aligner on every example pair; the returned value is not used here
for testCase in testDict:
    reversedadapt.getAlignmentsPromptAsrTrans(testCase['prompt'], testCase['asrTrans'])


        # Add begin, end and confidence scores

       aligned_asrTrans reversed_aligned_asrTrans  correct
prompt                                                    
jong           jon jong                  jon jong     True
lach               lang                      lang    False
zieke            ri-n-g                    rin-g-    False
        aligned_asrTrans reversed_aligned_asrTrans  correct
prompt                                                     
jong            jon jong                  jon jong     True
onze               onzin            onzin onz onze     True
zieke   onz onze z zeker                  z z-eker    False
       aligned_asrTrans reversed_aligned_asrTrans  correct
prompt                                                    
jong               jong                      jong     True
onze              -----                      --z-    False
zieke             zieke                    --ieke     True
       aligned_asrTrans reversed_aligned_asrTrans  correct
prompt                                             

In [37]:
# Build whisperOutputDF: for every recording, align the prompt with the Whisper
# transcription and attach Whisper's confidence and timing to each prompt word.

# Column layout of the result; only needed as a fallback when there is nothing to process
whisperOutputColumns = [
    'aligned_asrTrans', 'reversed_aligned_asrTrans', 'correct', 'whisperWordIndex',
    'confidence', 'begin', 'end', 'filename']

# Values used when no Whisper word could be matched to a prompt word
noMatchInfo = {'confidence': 0, 'start': 0, 'end': 0}

# One alignment table per recording; concatenated once after the loop instead of
# growing a DataFrame with pd.concat in every iteration.
perFileAlignmentDFs = []

# For each whisper output file, extract relevant information
for file in whisperTranscriptionDict.keys():

    print("Processing ", file, "...")

    wordInfoByLabel = whisperTranscriptionDict[file]

    # The alignment below runs on lower-cased text, so aligned words must be looked up
    # case-insensitively; otherwise a capitalised Whisper word would never be found.
    # The original keys are merged in last, so an exact-case match always wins.
    wordInfoLookup = {label.lower(): info for label, info in wordInfoByLabel.items()}
    wordInfoLookup.update(wordInfoByLabel)

    # Get ASR transcription and prompt.
    # NOTE: the transcription is rebuilt from dictionary keys, so every distinct word
    # appears only once in it.
    asrTranscription = " ".join(wordInfoByLabel)
    prompt = getPrompt(file)

    # Align the ASR transcription and prompt using reversed ADAPT & non-reversed ADAPT
    # strategy, with textToParts to speed up the process
    target_text_parts_list, original_text_parts_list = adapttexttoparts.textToParts(
        prompt.lower(), asrTranscription.lower())

    # assumes textToParts returns two lists of equal length -- TODO confirm
    alignmentDFList = [
        reversedadapt.getAlignmentsPromptAsrTrans(targetPart.lower(), originalPart.lower())
        for targetPart, originalPart in zip(target_text_parts_list, original_text_parts_list)
    ]

    alignmentDF = pd.concat(alignmentDFList)
    alignmentDF.to_csv(
        '../../astla-data/dart-preposttest/slate-data/2-whisper-t-dis/'+file+'.csv')

    # Match info from alignmentDF with confidence scores, begin and end times.
    # Candidate lookup terms in order of priority: aligned_asrTrans, then the prompt
    # word itself, then reversed_aligned_asrTrans (alignment gaps '-' removed).
    foundIndexTermsList = []
    confidenceList = []
    startTimeList = []
    endTimeList = []
    for promptWord, row in alignmentDF.iterrows():
        candidateTerms = (
            row['aligned_asrTrans'].replace('-', ''),
            promptWord,
            row['reversed_aligned_asrTrans'].replace('-', ''),
        )

        # First non-empty candidate that Whisper recognised; '' when there is none
        foundIndexTerm = next(
            (term for term in candidateTerms if term != '' and term in wordInfoLookup),
            '')
        wordInfo = wordInfoLookup[foundIndexTerm] if foundIndexTerm != '' else noMatchInfo

        foundIndexTermsList.append(foundIndexTerm)
        confidenceList.append(wordInfo['confidence'])
        startTimeList.append(wordInfo['start'])
        endTimeList.append(wordInfo['end'])

    alignmentDF['whisperWordIndex'] = foundIndexTermsList
    alignmentDF['confidence'] = confidenceList
    alignmentDF['begin'] = startTimeList
    alignmentDF['end'] = endTimeList
    alignmentDF['filename'] = file

    perFileAlignmentDFs.append(alignmentDF)

if perFileAlignmentDFs:
    whisperOutputDF = pd.concat(perFileAlignmentDFs)
else:
    whisperOutputDF = pd.DataFrame(columns=whisperOutputColumns)


Processing  09eae7df-db7c-491b-b7c4-a7c9fcf62f47 ...
         aligned_asrTrans reversed_aligned_asrTrans  correct
prompt                                                      
jong                 jong                      jong     True
lach                 lang                      lang    False
strik               strik                     strik     True
vuur                 vuur                      vuur     True
specht             specht                    specht     True
keelpijn         keelpijn                  keelpijn     True
sla                   sla                       sla     True
buiten             buiten                    buiten     True
kous                 kous                      kous     True
         aligned_asrTrans reversed_aligned_asrTrans  correct
prompt                                                      
pink                 pink                      pink     True
schuur             schuur                    schuur     True
huisje             huisje       

In [38]:
def getIndexColumn(row):
    """Build the row identifier '<filename>_<prompt>' from the row's two fields."""
    fileId = row['filename']
    promptWord = row['prompt']
    return fileId + '_' + promptWord

# Re-index whisperOutputDF by '<filename>_<prompt>'.
# The prompt words are copied out of the index only when the 'prompt' column does not
# exist yet: after this cell has run once the index holds the composite ids, and
# copying it again would overwrite the prompt words. The guard makes the cell safe
# to re-run.
if 'prompt' not in whisperOutputDF.columns:
    whisperOutputDF['prompt'] = whisperOutputDF.index
whisperOutputDF['index'] = whisperOutputDF.apply(getIndexColumn, axis=1)
whisperOutputDF = whisperOutputDF.set_index('index')
whisperOutputDF

Unnamed: 0_level_0,aligned_asrTrans,reversed_aligned_asrTrans,correct,whisperWordIndex,confidence,begin,end,filename,prompt
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
09eae7df-db7c-491b-b7c4-a7c9fcf62f47_jong,jong,jong,True,jong,0.158,0.94,2.04,09eae7df-db7c-491b-b7c4-a7c9fcf62f47,jong
09eae7df-db7c-491b-b7c4-a7c9fcf62f47_lach,lang,lang,False,lang,0.282,2.48,2.68,09eae7df-db7c-491b-b7c4-a7c9fcf62f47,lach
09eae7df-db7c-491b-b7c4-a7c9fcf62f47_strik,strik,strik,True,strik,0.463,2.72,4.8,09eae7df-db7c-491b-b7c4-a7c9fcf62f47,strik
09eae7df-db7c-491b-b7c4-a7c9fcf62f47_vuur,vuur,vuur,True,vuur,0.212,4.8,5.56,09eae7df-db7c-491b-b7c4-a7c9fcf62f47,vuur
09eae7df-db7c-491b-b7c4-a7c9fcf62f47_specht,specht,specht,True,specht,0.026,6.04,6.98,09eae7df-db7c-491b-b7c4-a7c9fcf62f47,specht
...,...,...,...,...,...,...,...,...,...
e5920085-8c1b-415c-b25d-24b5e5a4a2bf_nieuw,nieuw,nieuw,True,nieuw,0.6,18.17,18.73,e5920085-8c1b-415c-b25d-24b5e5a4a2bf,nieuw
e5920085-8c1b-415c-b25d-24b5e5a4a2bf_juicht,juicht,juicht,True,juicht,0.23,18.73,19.43,e5920085-8c1b-415c-b25d-24b5e5a4a2bf,juicht
e5920085-8c1b-415c-b25d-24b5e5a4a2bf_vorst,vorst,vorst,True,vorst,0.022,19.83,20.73,e5920085-8c1b-415c-b25d-24b5e5a4a2bf,vorst
e5920085-8c1b-415c-b25d-24b5e5a4a2bf_stuwdam,stuwdam,stuwdam,True,stuwdam,0.027,21.09,22.11,e5920085-8c1b-415c-b25d-24b5e5a4a2bf,stuwdam


In [39]:
# Write the combined word-level assessments of all recordings to one CSV file
whisperAssessmentsCsvPath = '../../astla-data/dart-preposttest/slate-data/5-asr-analysis-results/whisper-word-assessments-all.csv'
whisperOutputDF.to_csv(whisperAssessmentsCsvPath)

## Output/Conclusion
The output of this notebook is written to:

    '../../astla-data/dart-preposttest/slate-data/5-asr-analysis-results/whisper-word-assessments-all.csv'

## Some blocks with example code

In [13]:
# EXAMPLE CODE showing how a Whisper result dictionary is structured

for whisperResult in whisperDict.values():
    # Top-level keys: ['text', 'segments', 'language']
    print(whisperResult.keys())

    # 'segments' is a list of Segment dictionaries
    segments = whisperResult['segments']
    print(segments)

    # Keys of a Segment:
    # ['id', 'seek', 'start', 'end', 'text', 'tokens', 'temperature', 'avg_logprob', 'compression_ratio', 'no_speech_prob', 'confidence', 'words']
    firstSegment = segments[0]
    print(firstSegment.keys())

    # 'words' is a list of Word dictionaries
    segmentWords = firstSegment['words']
    print(segmentWords)

    # Fields of a Word, printed one per line
    firstWord = segmentWords[0]
    for fieldName in ('text', 'start', 'end', 'confidence'):
        print(firstWord[fieldName])

dict_keys(['text', 'segments', 'language'])
[{'id': 0, 'seek': 0, 'start': 0.94, 'end': 31.13, 'text': ' jong lang strik vuur specht keelpijn sla buiten kous pink schuur huisje tussen fop zwart koen denk boomstam muis schrift', 'tokens': [50364, 49151, 70, 2265, 3575, 74, 9732, 374, 768, 4701, 803, 28591, 6041, 8039, 758, 6009, 350, 563, 7022, 956, 18470, 46526, 2884, 50119, 283, 404, 11873, 446, 8384, 268, 21285, 9351, 372, 335, 2992, 271, 956, 35742, 51863], 'temperature': 0.0, 'avg_logprob': -0.37514695888612326, 'compression_ratio': 1.263157894736842, 'no_speech_prob': 0.14187833666801453, 'confidence': 0.075, 'words': [{'text': 'jong', 'start': 0.94, 'end': 2.04, 'confidence': 0.158}, {'text': '[*]', 'start': 2.04, 'end': 2.48, 'confidence': 0}, {'text': 'lang', 'start': 2.48, 'end': 2.68, 'confidence': 0.282}, {'text': '[*]', 'start': 2.68, 'end': 2.72, 'confidence': 0}, {'text': 'strik', 'start': 2.72, 'end': 4.8, 'confidence': 0.463}, {'text': 'vuur', 'start': 4.8, 'end': 5.56,