This is a script to read the manual annotations from the TextGrid (on chunk and attempts level) and convert them to a csv where each row represents a chunk.

In [1]:
import glob
import pandas as pd
import os
import numpy as np
import tgt # https://textgridtools.readthedocs.io/en/stable/api.html

In [5]:
# tg_file = '/vol/tensusers5/wharmsen/astla-data/dart-preposttest/specom-data/annotations_6sep/stephanie_v1/5fcccbb2-d4e2-4e75-ada9-be011f69c55c_checked.TextGrid'
# audio_filename = os.path.basename(tg_file).replace('_checked.TextGrid', '')

tg_file = '/vol/tensusers2/wharmsen/SERDA-annotations/round1_stories_all_marjul/textgrid/set1_jul/ZPGND-story_3-20230116114853113_checked.TextGrid'
basename = os.path.basename(tg_file).replace('_checked.TextGrid', '')

In [6]:
"""
This function reads a .tg file and saves it as a dataframe where each row represents one interval.
The columns are the following five properties: tier_name, tier_type, start_time, end_time, text
"""
def read_textgrid_to_dataframe(tg_file, audio_filename):
    
    # Read TextGrid file
    tg = tgt.io.read_textgrid(tg_file, encoding='utf-8', include_empty_intervals=False)

    # Convert TextGrid file to Formatted Table (= df with on each row one interval)
    table = tgt.io.export_to_table(tg, separator=', ')
    formatted_table = [x.split(', ') for x in table.split('\n')]

    tg_df = pd.DataFrame(formatted_table[1:], columns = formatted_table[0])

    # with open(textgrids_as_table_dir + audio_filename + '.csv', 'w') as f:
    #     f.write(table)

    return tg_df

tg_df = read_textgrid_to_dataframe(tg_file, basename)
tg_df.tail()


Unnamed: 0,tier_name,tier_type,start_time,end_time,text
1071,description,IntervalTier,121.4052373071388,121.69748687465528,cor
1072,description,IntervalTier,122.16222360626742,122.3856713692632,cor
1073,description,IntervalTier,122.3856713692632,122.83712705368328,cor
1074,description,IntervalTier,123.20193972796211,123.73304092008289,del
1075,description,IntervalTier,123.7740823452826,124.26201928932356,cor


In [7]:
def print_typos_annotations(tg_df):
    prompts = [x.split(' ')[0] for x in tg_df[tg_df['tier_name'] == 'prompts'].loc[:,'text']]
    chunks = list(tg_df[tg_df['tier_name'] == 'chunks'].loc[:,'text'])

    difference = list({p for p in prompts} - {c for c in chunks})

    if(len(difference) > 0):
        print(difference)

def print_comments(tg_df):
    try: 
        print(tg_df[tg_df['tier_name'] == 'comments']['text'])
    except:
        print( 'no comments')


In [8]:
def getPromptDF(basename):

    pathToPromptIdxs = '/vol/tensusers2/wharmsen/SERDA-data/prompts/'

    task = basename.split('-')[1]
    taskType = task.split('_')[0]
    taskNr = task.split('_')[1]

    promptFileName = task + '-wordIDX.csv'
    promptFile = os.path.join(pathToPromptIdxs, promptFileName)

    promptDF = pd.read_csv(promptFile)

    return promptDF

promptDF = getPromptDF(basename)


In [15]:
# Save relevant info from tier 1: prompts in chunks_df

def initialize_chunks_df(tg_df, filename):
    chunks_df = tg_df[tg_df['tier_name'] == 'chunks']
    chunks_df.loc[:,'name'] = [prompt.split(' ')[0] for prompt in list(chunks_df['text'])]
    # chunks_df = chunks_df.loc[:, ['name', 'text']]
    chunks_df = chunks_df.drop(['tier_name', 'tier_type'], axis=1)
    # chunks_df = chunks_df.set_axis([filename + '_' + prompt.split(' ')[0] for prompt in chunks_df['text']], axis='index')

    return chunks_df

chunks_df = initialize_chunks_df(tg_df, basename).reset_index(drop=True)
chunks_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunks_df.loc[:,'name'] = [prompt.split(' ')[0] for prompt in list(chunks_df['text'])]


Unnamed: 0,start_time,end_time,text,name
0,4.338767596881418,5.0069299784635675,als,als
1,5.083049743453939,5.5566838367273625,ouders,ouders
2,5.5566838367273625,6.131810949987948,zien,zien
3,6.145680218385061,6.407870520018562,dat,dat
4,6.437120228925004,6.614733013902537,je,je
...,...,...,...,...
167,121.4052373071388,121.69748687465527,je,je
168,122.16222360626742,122.3856713692632,aan,aan
169,122.3856713692632,122.83712705368328,het,het
170,123.20193972796213,123.73304092008289,spelen,spelen


In [16]:
# Loop through chunkDF
promptList = list(promptDF['prompt'])
expected_index = 0

for chunk_idx, row in chunks_df.iterrows():
    
    rec_prompt = row['name']
    start_time = row['start_time']
    end_time = row['end_time']
    # text = row['text']

    # match with first occurence in promptDF    
    idxOfRecWordInPromptDF = promptList[start_time, end_time].index(rec_prompt)

    if idxOfRecWordInPromptDF != -1 and idxOfRecWordInPromptDF >= expected_index :
        promptDF.loc[idxOfRecWordInPromptDF, 'start_time'] = start_time
        promptDF.loc[idxOfRecWordInPromptDF, 'end_time'] = end_time
        promptDF.loc[idxOfRecWordInPromptDF, 'chunk'] = rec_prompt

        promptList[idxOfRecWordInPromptDF] = ''
        print(promptList)

    else:
        print(basename, 'contains typo:', rec_prompt)


TypeError: list indices must be integers or slices, not tuple

In [17]:
# Add information from tier 2: chunks

def expand_chunks_df_with_chunks(tg_df, chunks_df, audio_filename):
    chunks_df['chunks'] = ''
    chunks_df['start_time'] = ''
    chunks_df['end_time'] = ''

    tg_df_chunks = tg_df[tg_df['tier_name'] == 'chunks']

    for idx, row in tg_df_chunks.iterrows():
        word = row['text']
        chunks_df.loc[audio_filename +'_'+ word, 'chunks'] = row['text']
        chunks_df.loc[audio_filename +'_'+ word, 'start_time'] = row['start_time']
        chunks_df.loc[audio_filename +'_'+ word, 'end_time'] = row['end_time']  

    # chunks_df['chunks'] = list(tg_df[tg_df['tier_name'] == 'chunks'].loc[:,'text'])
    # chunks_df['start_time'] = list(tg_df[tg_df['tier_name'] == 'chunks'].loc[:,'start_time'])
    # chunks_df['end_time'] = list(tg_df[tg_df['tier_name'] == 'chunks'].loc[:,'end_time'])

    return chunks_df

chunks_df = expand_chunks_df_with_chunks(tg_df, chunks_df, basename)
chunks_df

Unnamed: 0,start_time,end_time,text,name,chunks
0,,,als,als,
1,,,ouders,ouders,
2,,,zien,zien,
3,,,dat,dat,
4,,,je,je,
...,...,...,...,...,...
ZPGND-story_3-20230116114853113_gezellig,113.41270584562311,114.02376707504018,,,gezellig
ZPGND-story_3-20230116114853113_elkaar,114.49099169165794,116.09616745848489,,,elkaar
ZPGND-story_3-20230116114853113_praten,116.28769411248129,116.88051470818442,,,praten
ZPGND-story_3-20230116114853113_over,118.85962346614718,119.4250831112794,,,over


In [12]:
# Add information from the other tiers, that have attempt boundaries instead of chunk boundaries.
# Match 0, 1, 2 or more attempts with one chunk.

def add_attempts_info_to_chunks_df(tg_df, chunks_df):

    attempts_df = tg_df[tg_df['tier_name'].isin(['attempts', 'attemptsPhones', 'correct', 'description'])]

    resulting_matrix = []

    for idx, chunkInfo in chunks_df.iterrows():
        startTimeChunk = chunkInfo['start_time']
        endTimeChunk = chunkInfo['end_time']

        resulting_info = [startTimeChunk, endTimeChunk]

        for annotationType in ['attempts', 'attemptsPhones', 'correct', 'description']:

            # Get first and last attempt on certain chunk
            try:
                startAttemptIdx = attempts_df[(attempts_df['start_time'] == startTimeChunk) & (attempts_df['tier_name'] == annotationType)].index[0]
                endAttemptIdx = attempts_df[(attempts_df['end_time'] == endTimeChunk) & (attempts_df['tier_name'] == annotationType)].index[0]

                # Combine all attempts for one chunk
                attempts_to_chunks_list = []
                for idx in np.arange(startAttemptIdx, endAttemptIdx+1, 1):
                    attempts_to_chunks_list.append(attempts_df.loc[idx, 'text'])

                resulting_info.append("-".join(attempts_to_chunks_list))
            
            except:
                # prompt is not read, add default annotation
                resulting_info.append("")

            print(resulting_info)
            

        resulting_matrix.append(resulting_info)

    attempts_info_df = pd.DataFrame(resulting_matrix, columns = ['attemptsStart', 'attemptsEnd', 'graphTrans', 'phonTrans', 'assessment', 'assessmentDescription'])

    # Attempts_info_df and chunks_df should be matched on startTime
    attempts_info_df_time_index = attempts_info_df.rename(columns={"attemptsStart": "start_time"}).set_index('start_time')
    chunks_df_time_index = chunks_df.set_index('start_time')

    return chunks_df_time_index.join(attempts_info_df_time_index).reset_index().set_index('name')

attempts_info_df = add_attempts_info_to_chunks_df(tg_df, chunks_df)
attempts_info_df.head(20)

['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']
['', '', '', '']
['', '', '', '', '']
['', '', '', '', '', '']
['', '', '']

Unnamed: 0_level_0,start_time,end_time,text,chunks,attemptsEnd,graphTrans,phonTrans,assessment,assessmentDescription
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
als,,,als,,,,,,
als,,,als,,,,,,
als,,,als,,,,,,
als,,,als,,,,,,
als,,,als,,,,,,
als,,,als,,,,,,
als,,,als,,,,,,
als,,,als,,,,,,
als,,,als,,,,,,
als,,,als,,,,,,


In [22]:
len(attempts_info_df.drop_duplicates())

102

In [9]:
# Main function
pd.set_option('mode.chained_assignment', None)

# For each TextGrid with manual annotations
for tg_file in textgrid_files:

    # Get audio file name
    tg_audio_name = os.path.basename(tg_file).replace('_checked.TextGrid', '.mp3')
    audio_filename = os.path.basename(tg_file).replace('_checked.TextGrid', '')

    try:
        tg_df = read_textgrid_to_dataframe(tg_file, audio_filename)
    except: 
        print('Corrupt TextGrid file')

    print(audio_filename)
    print_typos_annotations(tg_df)
    print_comments(tg_df)

    chunks_df = initialize_chunks_df(tg_df, audio_filename)
    chunks_df = expand_chunks_df_with_chunks(tg_df, chunks_df, audio_filename)
    chunks_df = add_attempts_info_to_chunks_df(tg_df, chunks_df)

    if len(chunks_df) == 24:
        chunks_df.to_csv(chunks_attempts_matched_dir + audio_filename + '.csv')
    else: 
        chunks_df.to_csv(chunks_attempts_not_matched_dir + audio_filename + '.csv')
    print('\n')
    

NameError: name 'textgrid_files' is not defined