In [6]:
import pandas as pd
import os
import json
import glob
import tgt

# from utils import read_textgrids as rf
import utils.sclite_string_normalizer as sclite_norm

In [7]:
# Read textgrid and extract begin/end time of first three sentences
def read_tg_file_to_df_jasmin(tg_file):

    # Read TextGrid file
    try:
        tg = tgt.io.read_textgrid(tg_file, encoding='utf-8', include_empty_intervals=False)
    except:
        tg = tgt.io.read_textgrid(tg_file, encoding='utf-16', include_empty_intervals=False)

    # Convert TextGrid file to Formatted Table (= df with on each row one interval)
    table = tgt.io.export_to_table(tg, separator='\t')
    formatted_table = [x.split('\t') for x in table.split('\n')]

    return pd.DataFrame(formatted_table[1:], columns = formatted_table[0])

# Read textgrid and extract begin/end time of first three sentences
def read_tg_file_to_df(tg_file, file_encoding):

    # Read TextGrid file
    tg = tgt.io.read_textgrid(tg_file, encoding = file_encoding, include_empty_intervals=False)

    # Convert TextGrid file to Formatted Table (= df with on each row one interval)
    table = tgt.io.export_to_table(tg, separator='; ')
    formatted_table = [x.split('; ') for x in table.split('\n')]
    df = pd.DataFrame(formatted_table[1:], columns = formatted_table[0])

    return df

In [45]:
def readTextGridFile(tgFile, corpus):
    # Read TextGrid file
    if corpus == 'serda' or corpus == 'dart':
        tg_df = read_tg_file_to_df(tgFile, 'latin-1')
    elif corpus == 'jasmin':
        tg_df = read_tg_file_to_df_jasmin(tgFile)
    return tg_df.astype({'start_time':float, 'end_time':float} )

def selectWordTierTextGrid(tg_df, word_tier_name):
    return tg_df[tg_df['tier_name'] == word_tier_name]

def splitTextDFIntoSentences(tg_df_orth_trans):
    sentenceDFList = []
    tg_df_orth_trans = tg_df_orth_trans.reset_index()
    startIDX = tg_df_orth_trans.index[0]
    for idx, row in tg_df_orth_trans.iterrows():
        if row['text'][-1] in ['.', '!', '?']:
            sentenceDFList.append(tg_df_orth_trans.loc[startIDX:idx, :])
            startIDX = idx+1
        # If last sentence does not end with . ? or !
        elif len(tg_df_orth_trans)-1 == idx:
            sentenceDFList.append(tg_df_orth_trans.loc[startIDX:idx, :])
            startIDX = idx+1
    return sentenceDFList

def wordRowToWordSegment(row):
    # Remove annotation tags (*u, *a, etc.), remove all punctuation except the basic punctuation (!-'.?) and all default normalization steps (poss. pronouns, names, spelling errors, write numbers as words)
    # w = sclite_norm.normalize_string(row['text'], annTags=True, all_punct=False, basic_punct=True)
    w = sclite_norm.normalize_string(row['text'], annTags=True, all_punct=False, basic_punct=True, names_as_prompt = False)

    if ' ' in w:
        print(w)

    return {
                "text": w.replace(' ', ''),
                "start": row['start_time'],
                "end": row['end_time'],
                "confidence": 0.0
            }


def turnSentenceDFIntoSegment(sentenceDF, sentenceNr):

    # Remove _ words, these are noisy areas before/after words
    sentenceDF = sentenceDF[sentenceDF['text'] != '_']

    wordsList = list(sentenceDF.apply(wordRowToWordSegment, axis=1))

    return {
            "id": sentenceNr,
            "seek": 0,
            "start": sentenceDF.loc[sentenceDF.index[0], 'start_time'],
            "end": sentenceDF.loc[sentenceDF.index[-1], 'end_time'],
            "text": " ".join([x['text'] for x in wordsList]),
            "words": wordsList
    }


# Prepare JASMIN-NL and JASMIN-VL data

In [50]:
#####################
### Define inputs ###
#####################

# Read recordingsDF
# basePath = '/vol/tensusers2/wharmsen/JASMIN-fluency-features/comp-q-read_nl_age7-11_nat-nonorm'
basePath = '/vol/tensusers2/wharmsen/JASMIN-fluency-features/comp-q-read_vl_age7-11_nat-nonorm'
recDF_path = os.path.join(basePath, '03_metadata/recordingsDF.tsv')

# Set corresponding TextGrid dir with orthographic transcriptions
tgDir = os.path.join(basePath, '00_orig_data/textgrids')
tgExtension = '_updated.TextGrid'

# Set corpus
corpus = 'jasmin'

# Create output dir
outputDir = os.path.join(basePath, '06_manual_fluency_features/json-orth-trans')

# Define output transcription files
otTrans_norm = []
outputTranscriptsNormFile = os.path.join(basePath, '06_manual_fluency_features/ot-norm.csv')


#########################
### Create JSON files ###
#########################

if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# Read recordingsDF
recDF = pd.read_csv(recDF_path, sep= '\t', index_col=0)

for audioID, row in recDF.iterrows():

    # Extract appropriate metadata of each recording
    startTimeRec = row['startTimeFirstSent']
    endTimeRec = row['endTimeLastSent']
    cutStart = row['cutStart']
    cutEnd = row['cutEnd']
    totalDuration = row['duration']

    # Read TextGrid File
    aviLevel = audioID.split('-')[1].split('_')[0].replace('AVI', 'AVI ')
    recordingID = row['recordingID']
    tgFile = os.path.join(tgDir, aviLevel + '/' + recordingID + tgExtension)
    tg_df = readTextGridFile(tgFile, corpus)
    word_tier_name = tg_df.loc[0,'tier_name']
    tg_df_orth_trans = selectWordTierTextGrid(tg_df, word_tier_name)

    # Only JASMIN: select part of TextGrid that belongs to the specific story and correct the time stamps
    tg_df_orth_trans = tg_df_orth_trans[tg_df_orth_trans['start_time']>=cutStart]
    tg_df_orth_trans = tg_df_orth_trans[tg_df_orth_trans['end_time']<=cutEnd]
    tg_df_orth_trans['start_time'] = tg_df_orth_trans['start_time'] - cutStart
    tg_df_orth_trans['end_time'] = tg_df_orth_trans['end_time'] - cutStart

    # Split textDF into sentences
    sentenceDFList = splitTextDFIntoSentences(tg_df_orth_trans)

    # Change each sentenceDF to a segment
    segmentList = [turnSentenceDFIntoSegment(sentenceDF, sentenceNr) for sentenceNr, sentenceDF in enumerate(sentenceDFList)]

    # Save output as Dict
    tgDict = {
        "text" : " ".join([segment['text'] for segment in segmentList]),
        "segments" : segmentList,
    }

    # OUTPUT 1: original transcriptions
    # otTrans_norm.append([audioID, sclite_norm.normalize_string(tgDict['text'])])
    otTrans_norm.append([audioID, sclite_norm.normalize_string(tgDict['text'], names_as_prompt = False)])


    # OUTPUT 2: Write tgDict as json file
    outputFile = os.path.join(outputDir, audioID + '.json')
    # Check if file already exists, if it does, only dump file if current raterNr == 1 (Some files are rated by two raters, we only use the ratings by rater A01 (raterNr = 1))
    with open(outputFile, "w") as outfile:
        json.dump(tgDict, outfile, indent=4)

print(str(len(glob.glob(os.path.join(outputDir, '*.json')))) + ' .json files created in '+ outputDir)

# Write ot-norm file
otTrans_norm_DF = pd.DataFrame(otTrans_norm, columns=['audioID', 'orthographic_transcription'])
otTrans_norm_DF.to_csv(outputTranscriptsNormFile, index=False)
print(outputTranscriptsNormFile, 'file created. This file is fully normalized, in contrast to the generated json files, also the basic punctuation is removed (!-\'.?)')

z egt
b il
l et
j jan
laa t
klim op school
z oek t
klim scho
bus hak
klim op
kind je
mama s
nee ee
s sjort
nee ee
schoola arts
b bril
re reporter
gei nig
uit kiezen
kamer ploeg


KeyboardInterrupt: 

# Prepare SERDA-comp1 data

In [10]:
#####################
### Define inputs ###
#####################

corpus = 'serda'
basePath = '/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1-nonorm'
tgFileDir = os.path.join(basePath, '00_orig_data/textgrids')
tgFileExtension = '.TextGrid'
outputDir = os.path.join(basePath, '06_manual_fluency_features/json-orth-trans')
word_tier_name = 'attempts'

#########################
### Create JSON files ###
#########################

# Create output dir
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# Define output transcription files
otTrans_norm = []
otTrans_allFiles = []
outputTranscriptsNormFile = os.path.join(basePath, '06_manual_fluency_features/ot-norm.csv')
outputTranscriptsAllFile = os.path.join(basePath, '06_manual_fluency_features/ot-all.csv')

# List all .TextGrid files
# textgridFiles = glob.glob(os.path.join(tgFileDir, 'ZNNGY-story_3-20230110103621268_A01_full_punct*' + tgFileExtension))
textgridFiles = glob.glob(os.path.join(tgFileDir, '*' + tgFileExtension))
print(len(textgridFiles), 'TextGrid files found')

for tgFile in textgridFiles:
    print(tgFile)

    # Read TextGrid file
    tg_df = readTextGridFile(tgFile, corpus)
    tg_df_orth_trans = selectWordTierTextGrid(tg_df, word_tier_name)

    # Split textDF into sentences
    sentenceDFList = splitTextDFIntoSentences(tg_df_orth_trans)

    # Change each sentenceDF to a segment
    segmentList = [turnSentenceDFIntoSegment(sentenceDF, sentenceNr) for sentenceNr, sentenceDF in enumerate(sentenceDFList)]

    # Save output as Dict
    tgDict = {
        "text" : " ".join([segment['text'] for segment in segmentList]),
        "segments" : segmentList,
    }   

    # OUTPUTS
    basename = os.path.basename(tgFile).split('_A0')[0]
    raterNr = os.path.basename(tgFile).split('_A0')[1][0]

    # OUTPUT 1: Original transcriptions
    # otTrans_allFiles.append([os.path.basename(tgFile).replace('.TextGrid', ''), sclite_norm.normalize_string(tgDict['text'])])
    otTrans_allFiles.append([os.path.basename(tgFile).replace('.TextGrid', ''), sclite_norm.normalize_string(tgDict['text'], names_as_prompt=False)])

    recID = basename.split('-2')[0]
    if recID not in ['Z2BYD-story_1', 'YWPWY-story_2', 'YVBRP-story_3'] or raterNr == '1':
        # otTrans_norm.append([basename, sclite_norm.normalize_string(tgDict['text'])])
        otTrans_norm.append([basename, sclite_norm.normalize_string(tgDict['text'], names_as_prompt=False)])


    # OUTPUT 2: Write tgDict as json file
    outputFile = os.path.join(outputDir, basename + '.json')

    # Check if file already exists, if it does, only dump file if current raterNr == 1 (Some files are rated by two raters, we only use the ratings by rater A01 (raterNr = 1))
    if (not os.path.exists(outputFile)) or (os.path.exists(outputFile) and raterNr == '1'):
        with open(outputFile, "w") as outfile:
            json.dump(tgDict, outfile, indent=4)

print(str(len(glob.glob(os.path.join(outputDir, '*.json')))) + ' .json files created in '+ outputDir)

# Write ot-norm file
otTrans_norm_DF = pd.DataFrame(otTrans_norm, columns=['audioID', 'orthographic_transcription']).set_index('audioID').sort_index()
otTrans_norm_DF.to_csv(outputTranscriptsNormFile)
print(outputTranscriptsNormFile, 'file created. This file is fully normalized, in contrast to the generated json files, also the basic punctuation is removed (!-\'.?)')

# Write ot-all file
otTrans_all_DF = pd.DataFrame(otTrans_allFiles, columns=['audioID', 'orthographic_transcription']).set_index('audioID').sort_index()
otTrans_all_DF.to_csv(outputTranscriptsAllFile)
print(outputTranscriptsAllFile, 'file created. This file is fully normalized, in contrast to the generated json files, also the basic punctuation is removed (!-\'.?)')

21 TextGrid files found
/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1-nonorm/00_orig_data/textgrids/QPPY5-story_2-20230120114816570_A01_full_punct.TextGrid
/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1-nonorm/00_orig_data/textgrids/QTB2S-story_3-20221216105258185_A01_full_punct.TextGrid
/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1-nonorm/00_orig_data/textgrids/SMVCS-story_1-20221103091343185_A01_full_punct.TextGrid
/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1-nonorm/00_orig_data/textgrids/VJCMQ-story_2-20221123111647247_A01_full_punct.TextGrid
/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1-nonorm/00_orig_data/textgrids/WHHXX-story_3-20221107134241743_A01_full_punct.TextGrid
/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1-nonorm/00_orig_data/textgrids/XSWMB-story_1-20221216115257809_A01_full_punct.TextGrid
/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1-nonorm/00_orig_data/textgrids/YHTKC-story_1-20221117135859005_A03_orth_punct.TextG

DART test data (annotations of 62 annotations of 60 audio files)

In [None]:
#####################
### Define inputs ###
#####################

corpus = 'dart'
basePath = '/vol/tensusers2/wharmsen/DART-fluency-features/comp2'
tgFileDir = os.path.join(basePath, '00_orig_data/textgrids')
tgFileExtension = '.TextGrid'
outputDir = os.path.join(basePath, '06_manual_fluency_features/json-orth-trans')
word_tier_name = 'attempts'
normalizeNamesAndSpellingErrors = False

#########################
### Create JSON files ###
#########################

# Create output dir
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# Define output transcription files
otTrans_norm = []
otTrans_allFiles = []
outputTranscriptsNormFile = os.path.join(basePath, '06_manual_fluency_features/ot-norm.csv')
outputTranscriptsAllFile = os.path.join(basePath, '06_manual_fluency_features/ot-all.csv')

# List all .TextGrid files
textgridFiles = glob.glob(os.path.join(tgFileDir, '*' + tgFileExtension))
print(len(textgridFiles), 'TextGrid files found')

for tgFile in textgridFiles:
    print(tgFile)

    # Read TextGrid file
    tg_df = readTextGridFile(tgFile, corpus)
    tg_df_orth_trans = selectWordTierTextGrid(tg_df, word_tier_name)

    # Split textDF into sentences
    sentenceDFList = splitTextDFIntoSentences(tg_df_orth_trans)

    if len(sentenceDFList) == 0:
        print('EMPTY SENTENCE DF')
        print(sentenceDFList)

    # Change each sentenceDF to a segment
    segmentList = [turnSentenceDFIntoSegment(sentenceDF, sentenceNr) for sentenceNr, sentenceDF in enumerate(sentenceDFList)]

    print('SEGMENT LIST')
    print(" ".join([segment['text'] for segment in segmentList]))

    # Save output as Dict
    tgDict = {
        "text" : " ".join([segment['text'] for segment in segmentList]),
        "segments" : segmentList,
    }   

    # OUTPUTS
    basename = os.path.basename(tgFile).split('-A0')[0]
    raterNr = os.path.basename(tgFile).split('-A0')[1][0]

    print(basename, raterNr)

    # OUTPUT 1: Original transcriptions
    # otTrans_allFiles.append([os.path.basename(tgFile).replace('.TextGrid', ''), sclite_norm.normalize_string(tgDict['text'])])
    otTrans_allFiles.append([os.path.basename(tgFile).replace('.TextGrid', ''), sclite_norm.normalize_string(tgDict['text'], names_as_prompt=False)])

    # For
    recID = basename.split('-A')[0]
    print(recID)
    if recID not in ['10104-posttest2-11', '33107-pretest2-11'] or raterNr == '1':
        # otTrans_norm.append([basename, sclite_norm.normalize_string(tgDict['text'])])
        otTrans_norm.append([basename, sclite_norm.normalize_string(tgDict['text'], names_as_prompt=False)])


    # OUTPUT 2: Write tgDict as json file
    outputFile = os.path.join(outputDir, basename + '.json')

    # Check if file already exists, if it does, only dump file if current raterNr == 1 (Some files are rated by two raters, we only use the ratings by rater A01 (raterNr = 1))
    if (not os.path.exists(outputFile)) or (os.path.exists(outputFile) and raterNr == '1'):
        with open(outputFile, "w") as outfile:
            json.dump(tgDict, outfile, indent=4)

print(str(len(glob.glob(os.path.join(outputDir, '*.json')))) + ' .json files created in '+ outputDir)

# Write ot-norm file
otTrans_norm_DF = pd.DataFrame(otTrans_norm, columns=['audioID', 'orthographic_transcription']).set_index('audioID').sort_index()
otTrans_norm_DF.to_csv(outputTranscriptsNormFile)
print(outputTranscriptsNormFile, 'file created. This file is fully normalized, in contrast to the generated json files, also the basic punctuation is removed (!-\'.?)')

# Write ot-all file
otTrans_all_DF = pd.DataFrame(otTrans_allFiles, columns=['audioID', 'orthographic_transcription']).set_index('audioID').sort_index()
otTrans_all_DF.to_csv(outputTranscriptsAllFile)
print(outputTranscriptsAllFile, 'file created. This file is fully normalized, in contrast to the generated json files, also the basic punctuation is removed (!-\'.?)')

58 TextGrid files found
/vol/tensusers2/wharmsen/DART-fluency-features/comp2/00_orig_data/textgrids/18103-posttest1-11-A01.TextGrid
SEGMENT LIST
lesje kijken la hagel jaap sterk bestaat buren duik stoei ka krachken swink boomstam warmst jonge web zieke rechtsaf sap sterhun nieuw juicht vorst stuurdam blij
18103-posttest1-11 1
18103-posttest1-11
/vol/tensusers2/wharmsen/DART-fluency-features/comp2/00_orig_data/textgrids/34106-posttest0-11-A01.TextGrid
sprie kach
SEGMENT LIST
taar taart strik bloemen betaal schoef waarde spruin l klets genoeg sproeien lach spriekach bommen schuur nu pranter schappen hik hink w waai sneeuwpop zoutje kappen vreest melk
34106-posttest0-11 1
34106-posttest0-11
/vol/tensusers2/wharmsen/DART-fluency-features/comp2/00_orig_data/textgrids/1113-posttest2-44-A01.TextGrid
SEGMENT LIST
vlo mooi trompelch toe nicht ze onze slepen raampje tussen zanger flits stopla bewaar nieuwtje schroot buiten vondst bank markt spuw
1113-posttest2-44 1
1113-posttest2-44
/vol/tensuse