In [5]:
import glob
import os

import pandas as pd
import textgrids as tg
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef


In [6]:
def tier2List(tierName, file):
    """Return the stripped text of every interval in one tier.

    Args:
        tierName: Key used to look up the tier in ``file``.
        file: Mapping-like object (here a loaded TextGrid) whose
            ``file[tierName]`` is iterable and yields objects with a
            ``text`` string attribute.

    Returns:
        A list with one entry per interval, in iteration order, each with
        leading and trailing whitespace removed.
    """
    return [segment.text.strip() for segment in file[tierName]]


In [7]:
# Root folder holding one sub-folder of annotated TextGrids per annotator.
basepath = '../../astla-data/dart-preposttest/slate-data/6-phonetic-transcriptions/annotated-textgrids/'
annotators = ['wieke', 'loes', 'carlos']
# Only this annotator's files supply the 'prompt_in_phonemes' column.
referenceAnnotator = 'wieke'

# Collect every row as a plain dict and build the frame once at the end;
# enlarging a DataFrame cell-by-cell through .loc re-allocates on each new row.
rowsByIndex = {}
for annotator in annotators:
    # glob returns files in arbitrary order; sorting keeps the row order reproducible.
    filelist = sorted(glob.glob(basepath + annotator + '/*.TextGrid'))

    for tgfile in filelist:
        textgrid = tg.TextGrid(tgfile)

        # basename() also copes with the '\\' separators glob produces on Windows,
        # so one recording maps to the same ID for every annotator.
        fileID = os.path.basename(tgfile).replace('.TextGrid', '')

        promptWords = tier2List('prompt', textgrid)
        transcriptions = tier2List('phoneticTranscription', textgrid)
        comments = tier2List('comments', textgrid)
        promptPhones = tier2List('prompt_phonemes', textgrid)

        # Every prompt interval needs a counterpart in the other tiers; fail with
        # the offending file name instead of an anonymous IndexError.
        if min(len(transcriptions), len(comments), len(promptPhones)) < len(promptWords):
            raise ValueError(
                'Tier lengths do not match the prompt tier in ' + tgfile)

        # zip stops at the prompt tier's length, matching the original index loop.
        for promptWord, transcription, comment, phones in zip(
                promptWords, transcriptions, comments, promptPhones):
            row = rowsByIndex.setdefault(fileID + '_' + promptWord, {})
            if annotator == referenceAnnotator:
                row['prompt_in_phonemes'] = phones
            row[annotator] = transcription
            row['comments_' + annotator] = comment

# Fix the column layout explicitly: prompt first, then each annotator's
# transcription followed by that annotator's comments.
columnOrder = ['prompt_in_phonemes']
for annotator in annotators:
    columnOrder += [annotator, 'comments_' + annotator]

outputDF = pd.DataFrame(
    list(rowsByIndex.values()),
    index=list(rowsByIndex.keys()),
    columns=columnOrder,
)

# Remove rows whose prompt is the empty string. Rows that never received a value
# from the reference annotator hold a missing value rather than '', so this
# comparison does not target them.
# .copy() makes the result an independent frame that later cells can add columns to.
outputDF = outputDF[outputDF['prompt_in_phonemes'] != ''].copy()
outputDF.head(3)


Unnamed: 0,prompt_in_phonemes,wieke,comments_wieke,loes,comments_loes,carlos,comments_carlos
09eae7df-db7c-491b-b7c4-a7c9fcf62f47_jong,j O N,j O N,,j O N,,d j O N,
09eae7df-db7c-491b-b7c4-a7c9fcf62f47_lach,l A x,l A N,,l A N,,l A N,
09eae7df-db7c-491b-b7c4-a7c9fcf62f47_strik,s t r I k,s t @ r I k,,s t @ r I k,,s t @ r I k,traag tempo


In [8]:
def checkIfValuesAreEqual(phonTrans1, phonTrans2):
    """Return 1 when the two values compare equal with ``==``, otherwise 0."""
    if phonTrans1 == phonTrans2:
        return 1
    return 0

# Flag, per row, whether an annotator's transcription is identical to the prompt
# (1) or not (0). wieke's scores are taken as the gold standard.
for targetColumn, sourceColumn in (
        ('correct_prompt_wieke', 'wieke'),
        ('correct_prompt_loes', 'loes')):
    outputDF[targetColumn] = outputDF.apply(
        lambda row: checkIfValuesAreEqual(row['prompt_in_phonemes'], row[sourceColumn]),
        axis=1,
    )


In [9]:
# Cohen's kappa over the two annotators' transcription strings.
transcriptionKappa = cohen_kappa_score(outputDF['wieke'], outputDF['loes'])
print(transcriptionKappa)

# Matthews correlation between the two binary "matches the prompt" columns.
correctnessMcc = matthews_corrcoef(
    outputDF['correct_prompt_wieke'],
    outputDF['correct_prompt_loes'],
)
print(correctnessMcc)


0.5712881505873102
0.7150907421553531


In [10]:
# Write the combined transcription table, including the correctness columns, to disk.
csvPath = '../../astla-data/dart-preposttest/slate-data/6-phonetic-transcriptions/phonetic-transcriptions-wieke-loes-carlos.csv'
outputDF.to_csv(csvPath)
