In [1]:
import glob
import pandas as pd
import os
import numpy as np
import tgt # https://textgridtools.readthedocs.io/en/stable/api.html

tg_file = '/vol/tensusers2/wharmsen/SERDA-annotations/round1_stories_all_marjul/textgrid/set1_jul/ZPGND-story_3-20230116114853113_checked.TextGrid'
basename = os.path.basename(tg_file).replace('_checked.TextGrid', '')

"""
This function reads a .tg file and saves it as a dataframe where each row represents one interval.
The columns are the following five properties: tier_name, tier_type, start_time, end_time, text
"""
def read_textgrid_to_dataframe(tg_file, audio_filename):
    
    # Read TextGrid file
    tg = tgt.io.read_textgrid(tg_file, encoding='utf-8', include_empty_intervals=False)

    # Convert TextGrid file to Formatted Table (= df with on each row one interval)
    table = tgt.io.export_to_table(tg, separator=', ')
    formatted_table = [x.split(', ') for x in table.split('\n')]

    tg_df = pd.DataFrame(formatted_table[1:], columns = formatted_table[0])

    # convert start_time and end_time from str to float
    convert_dict = {'start_time': float,
                    'end_time': float,
                    }
 
    tg_df = tg_df.astype(convert_dict)

    return tg_df

tg_df = read_textgrid_to_dataframe(tg_file, basename)
tg_df.head()


Unnamed: 0,tier_name,tier_type,start_time,end_time,text
0,prompts,IntervalTier,0.0,0.73,als /A l s/
1,prompts,IntervalTier,0.73,1.46,ouders /AU d @ r s/
2,prompts,IntervalTier,1.46,2.19,zien /z i n/
3,prompts,IntervalTier,2.19,2.92,dat /d A t/
4,prompts,IntervalTier,2.92,3.65,je /j @/


In [2]:
# Save relevant info from tier 1: prompts in chunks_df

def initialize_chunks_df(tg_df):
    chunks_df = tg_df[tg_df['tier_name'] == 'chunks']
    chunks_df = chunks_df.drop(['tier_name', 'tier_type'], axis=1)
    return chunks_df

chunks_df = initialize_chunks_df(tg_df).reset_index(drop=True)
chunks_df

Unnamed: 0,start_time,end_time,text
0,4.338768,5.006930,als
1,5.083050,5.556684,ouders
2,5.556684,6.131811,zien
3,6.145680,6.407871,dat
4,6.437120,6.614733,je
...,...,...,...
167,121.405237,121.697487,je
168,122.162224,122.385671,aan
169,122.385671,122.837127,het
170,123.201940,123.733041,spelen


In [3]:
attemptsInfoDF = tg_df[tg_df['tier_name'].isin(['attempts', 'attemptsPhones','correct', 'description'])]

In [4]:
def getAttemptInfo(oneAttemptDF, tier_name, information_type):
    try:
        df = oneAttemptDF[oneAttemptDF['tier_name'] == tier_name].reset_index(drop=True)
        if len(df)==0:
            print(oneAttemptDF)
        return df.loc[0, information_type]
    except:
        return ''

In [5]:
def getCorrespondingAttempts(attemptsInfoDF, chunk_start, chunk_end):
     # Select corresponding intervals from attemptsInfoDF
    correspondingAttempts = attemptsInfoDF[(attemptsInfoDF['start_time']>=chunk_start) & (attemptsInfoDF['start_time']<chunk_end)]

    uniqueStarttimes = list(set(correspondingAttempts.loc[:,'start_time']))

    # Create attemptsList
    attemptsList = []
    for startTime in uniqueStarttimes:
        oneAttemptDF = correspondingAttempts[correspondingAttempts['start_time'] == startTime]

        attemptsList.append({
            'attempt_text': getAttemptInfo(oneAttemptDF, 'attempts', 'text'),
            'phones': getAttemptInfo(oneAttemptDF, 'attemptsPhones', 'text'),
            'correct': getAttemptInfo(oneAttemptDF, 'correct', 'text'),
            'description': getAttemptInfo(oneAttemptDF, 'description', 'text'),
            'attempt_start': startTime,
            'attempt_end': getAttemptInfo(oneAttemptDF, 'attempts', 'end_time'),
        })

    return attemptsList


In [6]:
def getPromptDF(basename):

    pathToPromptIdxs = '/vol/tensusers2/wharmsen/SERDA-data/prompts/'

    task = basename.split('-')[1]
    taskType = task.split('_')[0]
    taskNr = task.split('_')[1]

    promptFileName = task + '-wordIDX.csv'
    promptFile = os.path.join(pathToPromptIdxs, promptFileName)

    promptDF = pd.read_csv(promptFile)

    return promptDF

promptDF = getPromptDF(basename)
promptDF

Unnamed: 0,task_id,prompt_id,prompt
0,story_3,0-0-Verslaafd,verslaafd
1,story_3,1-0-Als,als
2,story_3,1-1-ouders,ouders
3,story_3,1-2-zien,zien
4,story_3,1-3-dat,dat
...,...,...,...
167,story_3,11-6-je,je
168,story_3,11-7-aan,aan
169,story_3,11-8-het,het
170,story_3,11-9-spelen,spelen


In [20]:
def getPromptID(basename, chunk, idx_last_found_chunk):
    
    promptDF = getPromptDF(basename)
    promptList = list(promptDF['prompt'])
    
    #Find idx of chunk in sublist
    promptSubList = promptList[idx_last_found_chunk+1:]
    try:
        idx_of_chunk_in_sublist = promptSubList.index(chunk)
    except:
        print('chunk is not found')
        idx_of_chunk_in_sublist =  0
        
    idx_last_found_chunk = idx_of_chunk_in_sublist + idx_last_found_chunk +1

    return idx_last_found_chunk


In [21]:
# Initialize variables
chunk_dict = {}
idx_last_found_chunk = -1

for idx, row in chunks_df.iterrows():

    # Chunk - info
    chunk_start = row['start_time']
    chunk_end = row['end_time']
    chunk_text = row['text']

    # List with for each attempt information
    attemptsList = getCorrespondingAttempts(attemptsInfoDF, chunk_start, chunk_end)

    # Get prompt_id that corresponds to chunk_text (idx_last_found_chunk is used to differentiate between multiple occurings of the same word)
    idx_chunk_in_promptslist = getPromptID(basename, chunk_text, idx_last_found_chunk)

    print(idx_last_found_chunk, idx_chunk_in_promptslist, idx_chunk_in_promptslist - idx_last_found_chunk)

    idx_difference = idx_chunk_in_promptslist - idx_last_found_chunk

    while (idx_chunk_in_promptslist - idx_last_found_chunk) > 1:
        # Add empty cases for missing prompts in the chunks
        prompt_id = promptDF.loc[idx_last_found_chunk+1, 'prompt_id']
        chunk_dict[prompt_id] = {
            'chunk_text': '',
            'chunk_start': '',
            'chunk_end': '',
            'attempts': [],
        }

        idx_last_found_chunk+=1
        print(prompt_id)

    if (idx_chunk_in_promptslist - idx_last_found_chunk) == 1:
        # Add filled case
        prompt_id = promptDF.loc[idx_last_found_chunk+1, 'prompt_id']
        chunk_dict[prompt_id] = {
            'chunk_text': chunk_text,
            'chunk_start': chunk_start,
            'chunk_end': chunk_end,
            'attempts': attemptsList,
        }
        print(prompt_id)
    
    if (idx_chunk_in_promptslist - idx_last_found_chunk) < 1:
        print('something is wrong')

    # Update start_idx
    idx_last_found_chunk = idx_last_found_chunk+1



    # while (idx_chunk_in_promptslist - idx_last_found_chunk) >= 0:

    #     if (idx_chunk_in_promptslist - idx_last_found_chunk) == 1:
    #         # Update idx_last_found_chunk
    #         idx_last_found_chunk+=1

    #         # Save chunk and corresponding attempts in chunk_dict
    #         prompt_id = promptDF.loc[idx_last_found_chunk+1, 'prompt_id']
    #         chunk_dict[prompt_id] = {
    #             'chunk_text': chunk_text,
    #             'chunk_start': chunk_start,
    #             'chunk_end': chunk_end,
    #             'attempts': attemptsList,
    #         }
    #         break
        
    #     elif (idx_chunk_in_promptslist - idx_last_found_chunk) > 1:
    #         # Update idx_last_found_chunk
    #         idx_last_found_chunk+=1

    #         prompt_id = promptDF.loc[idx_last_found_chunk+1, 'prompt_id']
    #         chunk_dict[prompt_id] = {
    #             'chunk_text': '',
    #             'chunk_start': '',
    #             'chunk_end': '',
    #             'attempts': [],
    #         }
        
    #     else:
    #         print('ERROR Chunk is non-existing prompt: ', chunk_text)



-1 1 2
0-0-Verslaafd
1-0-Als
1 2 1
1-1-ouders
2 3 1
1-2-zien
3 4 1
1-3-dat
4 5 1
1-4-je
5 6 1
1-5-veel
6 7 1
1-6-aan
7 8 1
1-7-het
8 9 1
1-8-gamen
9 10 1
1-9-bent
10 11 1
1-10-denken
11 12 1
1-11-ze
12 13 1
1-12-misschien
13 14 1
1-13-dat
14 15 1
1-14-hun
15 16 1
1-15-kind
16 17 1
1-16-verslaafd
17 18 1
1-17-is
18 19 1
1-18-aan
19 20 1
1-19-gamen
          tier_name     tier_type  start_time   end_time  text
363        attempts  IntervalTier   17.408978  17.581892   jij
547  attemptsPhones  IntervalTier   17.408978  17.581892  j EI
          tier_name     tier_type  start_time   end_time  text
363        attempts  IntervalTier   17.408978  17.581892   jij
547  attemptsPhones  IntervalTier   17.408978  17.581892  j EI
20 21 1
2-0-Jij
21 22 1
2-1-kunt
22 23 1
2-2-ze
23 24 1
2-3-vast
24 25 1
2-4-wel
25 26 1
2-5-geruststellen
26 27 1
2-6-want
27 28 1
2-7-er
28 29 1
2-8-zijn
29 30 1
2-9-maar
30 31 1
2-10-heel
31 32 1
2-11-weinig
32 33 1
2-12-kinderen
33 34 1
2-13-op
34 35 1
2-14-de
35 36 1


KeyError: 172

In [22]:
import json

with open('test.json', 'w') as f:
    f.write(json.dumps(chunk_dict, indent=4, sort_keys=True))


In [23]:
print(len(chunk_dict.keys()))
print(len(promptDF))

172
172


In [144]:
print(set(promptDF['prompt_id']) - set(chunk_dict.keys()))


{'11-10-bent'}
