In [40]:
import pandas as pd
import glob
import os
import json
import librosa
import textgrids as tg


In [41]:
# CTC segmentation result to TextGrid
# Locations of the audio recordings, the CTC-segmentation alignments (JSON) and the
# directory that receives the generated TextGrid files.
input_audio_dir = '/vol/tensusers2/wharmsen/letterster-corpus/test-set/audio'
input_json_dir = '/vol/tensusers2/wharmsen/letterster-corpus/test-set/ctc_kurz_w2v_jg/json_align'
output_tg_dir = '/vol/tensusers2/wharmsen/letterster-corpus/test-set/ctc_kurz_w2v_jg/textgrids_jg'

# exist_ok=True keeps the cell re-runnable and avoids the race between checking for the
# directory and creating it.
os.makedirs(output_tg_dir, exist_ok=True)

In [42]:
# Read ctc segmentation ASR result
# glob returns its matches in arbitrary order, so sort them to make json_list[0] reproducible.
# The pattern '*.json' (with the dot) only matches files that really carry the .json extension.
json_list = sorted(glob.glob(os.path.join(input_json_dir, '*.json')))

for json_file in json_list:

    # Read JSON file
    with open(json_file, 'r') as f:
        asrResult = json.load(f)

    print(asrResult)

[{'text': 'ijs', 'start': 0.16025952896138485, 'end': 0.9303656999001331, 'conf': 0.0}, {'text': 'boom', 'start': 0.9303656999001331, 'end': 1.370538719207723, 'conf': 0.9745110029524023}, {'text': 'dun', 'start': 1.370538719207723, 'end': 1.8507274675432757, 'conf': 0.8642821795850372}, {'text': 'rek', 'start': 1.8507274675432757, 'end': 2.4109476739347535, 'conf': 0.871646658118282}, {'text': 'riet', 'start': 2.7212661867509986, 'end': 3.6514352738015976, 'conf': 0.0}, {'text': 'zout', 'start': 4.441942534953395, 'end': 5.472150944573901, 'conf': 0.0}, {'text': 'paal', 'start': 5.472150944573901, 'end': 6.032371150965379, 'conf': 0.9639161548444203}, {'text': 'kuil', 'start': 6.032371150965379, 'end': 7.032764376664447, 'conf': 0.0}, {'text': 'net', 'start': 7.032764376664447, 'end': 7.352890208888149, 'conf': 0.9223572339178645}, {'text': 'puur', 'start': 7.352890208888149, 'end': 8.233236247503328, 'conf': 0.0}, {'text': 'bak', 'start': 8.543554760319573, 'end': 9.393692389314246, 

In [43]:
# Take the first alignment file and locate the recording that shares its base name.
jsonFile = json_list[0]
# splitext swaps only the extension; str.replace would also rewrite a '.json' that occurs
# elsewhere in the file name.
audioFile = os.path.join(input_audio_dir, os.path.splitext(os.path.basename(jsonFile))[0] + '.wav')

print(audioFile)

/vol/tensusers2/wharmsen/letterster-corpus/test-set/audio/4010103_kaart1A_jan24.wav


In [44]:
# Read ASR result
def readJSON(jsonFile):
    """Parse the JSON document stored at `jsonFile` and return the decoded object."""
    with open(jsonFile, mode='r') as handle:
        return json.load(handle)

# Load the alignment stored in jsonFile; the code below iterates over it as a sequence of
# segment dicts with 'text', 'start' and 'end' entries.
asrResult = readJSON(jsonFile)

# Convert segment to interval
def convert_segment_to_interval(segment):
    """Build a `tg.Interval` from the 'text', 'start' and 'end' entries of `segment`."""
    fields = ('text', 'start', 'end')
    return tg.Interval(*[segment[name] for name in fields])

# One interval per aligned segment, in the order given by the alignment.
segments_as_intervals = [convert_segment_to_interval(segment) for segment in asrResult]

tgFile = tg.TextGrid()
tgFile.xmin = 0
# Ask librosa for the duration of the file itself rather than decoding and resampling the
# whole waveform with librosa.load only to count its samples.
tgFile.xmax = librosa.get_duration(path=audioFile) # duration of audio file

tgFile['prompts'] = segments_as_intervals

# splitext swaps only the extension; str.replace would also rewrite a '.json' that occurs
# elsewhere in the file name.
tgFile.write(os.path.join(output_tg_dir, os.path.splitext(os.path.basename(jsonFile))[0] + '.TextGrid'))



In [45]:
# Display the intervals that were assigned to the 'prompts' tier.
segments_as_intervals

[<Interval text="ijs" xmin=0.16025952896138485 xmax=0.9303656999001331>,
 <Interval text="boom" xmin=0.9303656999001331 xmax=1.370538719207723>,
 <Interval text="dun" xmin=1.370538719207723 xmax=1.8507274675432757>,
 <Interval text="rek" xmin=1.8507274675432757 xmax=2.4109476739347535>,
 <Interval text="riet" xmin=2.7212661867509986 xmax=3.6514352738015976>,
 <Interval text="zout" xmin=4.441942534953395 xmax=5.472150944573901>,
 <Interval text="paal" xmin=5.472150944573901 xmax=6.032371150965379>,
 <Interval text="kuil" xmin=6.032371150965379 xmax=7.032764376664447>,
 <Interval text="net" xmin=7.032764376664447 xmax=7.352890208888149>,
 <Interval text="puur" xmin=7.352890208888149 xmax=8.233236247503328>,
 <Interval text="bak" xmin=8.543554760319573 xmax=9.393692389314246>,
 <Interval text="lijn" xmin=9.393692389314246 xmax=10.153991240845539>,
 <Interval text="min" xmin=10.153991240845539 xmax=10.75422717626498>,
 <Interval text="jeuk" xmin=10.84445917942743 xmax=11.79463613099201>,
 

{'text': 'ijs', 'start': 0.010003932256990679, 'end': 0.8903499708721705, 'conf': 0.0}
{'text': 'boom', 'start': 0.8903499708721705, 'end': 1.370538719207723, 'conf': 0.19550263037255086}
{'text': 'dun', 'start': 1.370538719207723, 'end': 1.8907431965712382, 'conf': 0.16117060235644834}
{'text': 'rek', 'start': 1.8907431965712382, 'end': 2.4509634029627163, 'conf': 0.13261278240848437}
{'text': 'riet', 'start': 2.4509634029627163, 'end': 3.5413920189747, 'conf': 0.0}
{'text': 'zout', 'start': 4.461950399467376, 'end': 5.322091960719041, 'conf': 0.0}
{'text': 'paal', 'start': 5.322091960719041, 'end': 5.9023200316245, 'conf': 0.07227722372632844}
{'text': 'kuil', 'start': 5.9023200316245, 'end': 6.662618883155792, 'conf': 0.0}
{'text': 'net', 'start': 6.662618883155792, 'end': 7.302870547603195, 'conf': 0.0}
{'text': 'puur', 'start': 7.302870547603195, 'end': 8.24324017976032, 'conf': 0.0}
{'text': 'bak', 'start': 8.323468250665778, 'end': 9.303656999001332, 'conf': 0.0}
{'text': 'lijn'

In [None]:
# Directories from which the per-recording ASR result CSVs are read
# (one for story recordings, one for word-list recordings).
_wordLevelDir = '/vol/tensusers2/wharmsen/diagnostics_SERDA/study2.1/word-level'
asrOutputStories = _wordLevelDir + '/stories-conf/'
asrOutputWords = _wordLevelDir + '/word-segments/'

def getDurationAudio(audioFile):
    """Return the duration that `librosa.get_duration` reports for the file at `audioFile`."""
    duration = librosa.get_duration(path=audioFile)
    return duration

def getInterval(row, fileType):
    """Build the TextGrid interval for one row of an ASR result table.

    The interval label is the prompt followed by its phonemic transcription between
    slashes (``'<prompt> /<phonemes>/'``). The transcription is read from the
    ``'phonemes'`` column of the global ``lexicon``, looked up under the prompt as
    normalised by ``strman.normalizeText`` (both are defined outside this function).

    Parameters
    ----------
    row : mapping, e.g. a row yielded by ``DataFrame.iterrows``
        For ``'stories'`` the keys ``'prompt'``, ``'startTime'`` and ``'endTimes'`` are read;
        for ``'words'`` the keys ``'promptWord'``, ``'logStart'`` and ``'logEnd'``.
    fileType : str
        Either ``'stories'`` or ``'words'``.

    Returns
    -------
    tg.Interval
        Interval carrying the label and the start/end time taken from ``row``.

    Raises
    ------
    ValueError
        If ``fileType`` is neither ``'stories'`` nor ``'words'`` (previously this surfaced
        as an UnboundLocalError on the return statement).
    """
    if fileType == 'stories':
        prompt = row['prompt']
        startTime = row['startTime']
        # NOTE(review): 'endTimes' is plural while 'startTime' is singular - confirm that
        # this matches the column name in the stories CSV.
        endTime = row['endTimes']

    elif fileType == 'words':
        prompt = row['promptWord']
        # The logged times are divided by 1000 - presumably milliseconds to seconds; confirm.
        startTime = (row['logStart']/1000) #if row['logStart'] != 0.0 else previousEndTime
        endTime = (row['logEnd']/1000) #if row['logEnd'] != 0.0 else previousEndTime + 0.1

    else:
        raise ValueError(
            "Unsupported fileType {!r}; expected 'stories' or 'words'".format(fileType))

    # The label is built the same way for both file types.
    phonTrans = lexicon.loc[strman.normalizeText(prompt), 'phonemes']
    text = prompt + ' /' + phonTrans + '/'

    return tg.Interval(text, startTime, endTime)


def _buildWordIntervals(asrResultDF, fileType, totalDuration):
    """Return the word intervals of `asrResultDF`, with an empty interval filling every silence."""
    intervalList = []
    endTimePreviousInterval = 0

    # Create interval for each row in asrResultDF
    for _, row in asrResultDF.iterrows():
        interval = getInterval(row, fileType)

        # If silence between this and previous word, add silence interval
        if endTimePreviousInterval < interval.xmin:
            intervalList.append(tg.Interval('', endTimePreviousInterval, interval.xmin))
        intervalList.append(interval)
        endTimePreviousInterval = interval.xmax

    # Add final silence interval. This is done after the loop so that it does not depend on
    # the index labels of the DataFrame happening to coincide with its length.
    if endTimePreviousInterval < totalDuration:
        intervalList.append(tg.Interval('', endTimePreviousInterval, totalDuration))

    return intervalList


def _buildStoryIntervals(asrResultDF, totalDuration):
    """Return one interval per prompt, evenly spaced and together spanning [0, totalDuration]."""
    nrOfIntervals = len(asrResultDF)

    # Round every boundary separately (instead of rounding the interval length once and
    # multiplying it) so that the rounding error cannot accumulate; the last boundary is
    # pinned to the audio duration so the tier never ends before or beyond xmax.
    boundaries = [min(round(totalDuration * position / nrOfIntervals, 2), totalDuration)
                  for position in range(nrOfIntervals)]
    boundaries.append(totalDuration)

    intervalList = []
    for position, (_, row) in enumerate(asrResultDF.iterrows()):
        phonTrans = lexicon.loc[strman.normalizeText(row['prompt']), 'phonemes']
        text = row['prompt'] + ' /' + phonTrans + '/'
        intervalList.append(tg.Interval(text, boundaries[position], boundaries[position + 1]))

    return intervalList


def createTextGridForAnnotation(audioFile, notTranscribedList, fileType):
    """Create an annotation TextGrid for `audioFile`, pre-filled with the ASR prompts.

    The ASR result is read from ``<asrOutputStories or asrOutputWords>/<basename>.csv``.
    The returned TextGrid spans ``[0, duration of audioFile]`` and holds these tiers, in order:

    * ``prompts`` - for ``'words'``: one interval per CSV row (see ``getInterval``) with empty
      intervals filling the silences; for ``'stories'``: one evenly spaced interval per row;
      a single empty interval when no ASR result is available.
    * ``chunks``, ``attempts``, ``attemptsPhones``, ``correct``, ``description``, ``comments`` -
      a single empty interval each.

    Parameters
    ----------
    audioFile : str
        Path to the audio file; a ``.wav`` in the base name is stripped to find the CSV.
    notTranscribedList : list
        Returned unchanged (the code that appended to it is currently disabled).
    fileType : str
        Either ``'stories'`` or ``'words'``.

    Returns
    -------
    tuple
        ``(tgFile, notTranscribedList)``.

    Raises
    ------
    ValueError
        If ``fileType`` is neither ``'stories'`` nor ``'words'``.
    """
    if fileType not in ('stories', 'words'):
        raise ValueError(
            "Unsupported fileType {!r}; expected 'stories' or 'words'".format(fileType))

    # Read the duration once; every tier below shares it.
    totalDuration = getDurationAudio(audioFile)

    tgFile = tg.TextGrid()
    tgFile.xmin = 0
    tgFile.xmax = totalDuration # duration of audio file

    filename = os.path.basename(audioFile).replace('.wav', '')

    # Select corresponding ASR result
    if fileType == 'stories':
        try:
            asrResultDF = pd.read_csv(os.path.join(asrOutputStories, filename + '.csv'))
        except Exception:
            # Best effort: a missing or unreadable CSV is treated as "no ASR result".
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit propagate.
            asrResultDF = pd.DataFrame()
    else:
        asrResultDF = pd.read_csv(os.path.join(asrOutputWords, filename + '.csv'))
        # Order by promptID and drop the row labelled 0 after re-indexing, i.e. the first
        # prompt - presumably not a real test item; TODO confirm.
        asrResultDF = asrResultDF.sort_values(by='promptID', axis=0).reset_index().drop([0], axis=0)

    # Tier 0: Prompts (only filled when an ASR result is available)
    if len(asrResultDF) > 0:
        if fileType == 'words':
            tgFile['prompts'] = _buildWordIntervals(asrResultDF, fileType, totalDuration)
        else:
            tgFile['prompts'] = _buildStoryIntervals(asrResultDF, totalDuration)
    else:
        tgFile['prompts'] = [tg.Interval('', 0, totalDuration)]

    # Tiers 1-6: Chunks, Attempts, AttemptsPhones, Correct, Description, Comments.
    # Each starts as one empty interval covering the whole recording.
    for tierName in ('chunks', 'attempts', 'attemptsPhones', 'correct', 'description', 'comments'):
        tgFile[tierName] = [tg.Interval('', 0, totalDuration)]

    # Disabled bookkeeping of recordings without an ASR result:
    #     notTranscribedList.append(audioFile.split('/')[-2] + ' ' + audioFile.split('/')[-1])

    return tgFile, notTranscribedList