In [134]:
import glob
import pandas as pd
import os
import numpy as np
import tgt # https://textgridtools.readthedocs.io/en/stable/api.html

In [135]:
tg_file = '/vol/tensusers2/wharmsen/SERDA-annotations/round1_stories_all_marjul/textgrid/set1_jul/ZQV5W-story_2-20221212100028130_checked.TextGrid'
basename = os.path.basename(tg_file).replace('_checked.TextGrid', '')

"""
This function reads a .tg file and saves it as a dataframe where each row represents one interval.
The columns are the following five properties: tier_name, tier_type, start_time, end_time, text
"""
def read_textgrid_to_dataframe(tg_file, audio_filename):
    
    # Read TextGrid file
    tg = tgt.io.read_textgrid(tg_file, encoding='utf-8', include_empty_intervals=False)

    # Convert TextGrid file to Formatted Table (= df with on each row one interval)
    table = tgt.io.export_to_table(tg, separator=', ')
    formatted_table = [x.split(', ') for x in table.split('\n')]

    tg_df = pd.DataFrame(formatted_table[1:], columns = formatted_table[0])

    # convert start_time and end_time from str to float
    convert_dict = {'start_time': float,
                    'end_time': float,
                    }
 
    tg_df = tg_df.astype(convert_dict)

    return tg_df

tg_df = read_textgrid_to_dataframe(tg_file, basename)
tg_df.head()


Unnamed: 0,tier_name,tier_type,start_time,end_time,text
0,prompts,IntervalTier,0.0,0.65,met /m E t/
1,prompts,IntervalTier,0.65,1.3,twee /t w e/
2,prompts,IntervalTier,1.3,1.95,doppen /d O p @/
3,prompts,IntervalTier,1.95,2.6,van /v A n/
4,prompts,IntervalTier,2.6,3.25,een /e n/


In [136]:
# Save relevant info from tier 1: prompts in chunks_df

def initialize_chunks_df(tg_df):
    chunks_df = tg_df[tg_df['tier_name'] == 'chunks']
    chunks_df = chunks_df.drop(['tier_name', 'tier_type'], axis=1)
    return chunks_df

chunks_df = initialize_chunks_df(tg_df).reset_index(drop=True)
chunks_df

Unnamed: 0,start_time,end_time,text
0,3.026477,3.362827,met
1,3.629143,4.432645,twee
2,4.848195,5.483522,doppen
3,5.600310,5.943667,van
4,5.970517,6.074458,de
...,...,...,...
159,112.965254,113.560035,loopt
160,113.560035,113.756239,door
161,113.756239,113.926750,de
162,113.982808,114.377552,klas


In [137]:
attemptsInfoDF = tg_df[tg_df['tier_name'].isin(['attempts', 'attemptsPhones','correct', 'description'])]

In [138]:
def getAttemptInfo(oneAttemptDF, tier_name, information_type):
    try:
        df = oneAttemptDF[oneAttemptDF['tier_name'] == tier_name].reset_index(drop=True)
        if len(df)==0:
            print(oneAttemptDF)
        return df.loc[0, information_type]
    except:
        return ''

In [139]:
def getCorrespondingAttempts(attemptsInfoDF, chunk_start, chunk_end):
     # Select corresponding intervals from attemptsInfoDF
    correspondingAttempts = attemptsInfoDF[(attemptsInfoDF['start_time']>=chunk_start) & (attemptsInfoDF['start_time']<chunk_end)]

    uniqueStarttimes = list(set(correspondingAttempts.loc[:,'start_time']))

    # Create attemptsList
    attemptsList = []
    for startTime in uniqueStarttimes:
        oneAttemptDF = correspondingAttempts[correspondingAttempts['start_time'] == startTime]

        attemptsList.append({
            'attempt_text': getAttemptInfo(oneAttemptDF, 'attempts', 'text'),
            'phones': getAttemptInfo(oneAttemptDF, 'attemptsPhones', 'text'),
            'correct': getAttemptInfo(oneAttemptDF, 'correct', 'text'),
            'description': getAttemptInfo(oneAttemptDF, 'description', 'text'),
            'attempt_start': startTime,
            'attempt_end': getAttemptInfo(oneAttemptDF, 'attempts', 'end_time'),
        })

    return attemptsList


In [140]:
def getPromptDF(basename):

    pathToPromptIdxs = '/vol/tensusers2/wharmsen/SERDA-data/prompts/'

    task = basename.split('-')[1]
    taskType = task.split('_')[0]
    taskNr = task.split('_')[1]

    promptFileName = task + '-wordIDX.csv'
    promptFile = os.path.join(pathToPromptIdxs, promptFileName)

    promptDF = pd.read_csv(promptFile)

    return promptDF

promptDF = getPromptDF(basename)
promptDF

Unnamed: 0,task_id,prompt_id,prompt
0,story_2,0-0-Jojo,jojo
1,story_2,1-0-Met,met
2,story_2,1-1-twee,twee
3,story_2,1-2-doppen,doppen
4,story_2,1-3-van,van
...,...,...,...
175,story_2,18-4-klas,klas
176,story_2,18-5-en,en
177,story_2,18-6-bekijkt,bekijkt
178,story_2,18-7-alle,alle


In [143]:
def getPromptID(promptDF, chunk_text):
    # Get index of first row
    firstRowIndex = promptDF.index[0]

    # get index of target word in promptlist
    try:
        target_index = promptDF[promptDF['prompt']==chunk_text].index[0]
    except:
        target_index = -1

    if target_index == firstRowIndex:
        return pd.DataFrame(), promptDF.loc[target_index], promptDF.loc[target_index+1:]
    elif target_index > firstRowIndex:
        return promptDF.loc[firstRowIndex:target_index-1], promptDF.loc[target_index], promptDF.loc[target_index+1:]
    else:
        print('something went wrong: \'', chunk_text, '\' not in promptDF after row ', firstRowIndex)

In [144]:
# Initialize variables
chunk_dict = {}
promptDF = getPromptDF(basename)

for idx, row in chunks_df.iterrows():

    # Chunk - info
    chunk_start = row['start_time']
    chunk_end = row['end_time']
    chunk_text = row['text']

    # List with for each attempt information
    attemptsList = getCorrespondingAttempts(attemptsInfoDF, chunk_start, chunk_end)

    # Get prompt_id that corresponds to chunk_text (idx_last_found_chunk is used to differentiate between multiple occurings of the same word)
    empty_df_rows, target_df_row, remaining_df_rows = getPromptID(promptDF, chunk_text)

    # Create empty dict values for prompts that do not occur in the chunksDF
    for idx, row in empty_df_rows.iterrows():
        # Add empty cases for missing prompts in the chunks
        prompt_id = row['prompt_id']
        chunk_dict[prompt_id] = {
            'chunk_text': '',
            'chunk_start': '',
            'chunk_end': '',
            'attempts': [],
        }

    # Create filled dict values for prompts that do occur in the chunksDF
    # Add filled case
    prompt_id = target_df_row['prompt_id']
    chunk_dict[prompt_id] = {
        'chunk_text': chunk_text,
        'chunk_start': chunk_start,
        'chunk_end': chunk_end,
        'attempts': attemptsList,
    }

    # Update promptDF
    promptDF = remaining_df_rows



something went wrong: ' fles ' not in promptDF after row  94


TypeError: cannot unpack non-iterable NoneType object

In [121]:
import json

with open('test2.json', 'w') as f:
    f.write(json.dumps(chunk_dict, indent=4, sort_keys=True))


In [21]:
print(len(chunk_dict.keys()))
print(len(promptDF))

172
172


In [144]:
print(set(promptDF['prompt_id']) - set(chunk_dict.keys()))


{'11-10-bent'}
