This script extracts word-level start times, end times and durations from CTC forced alignments produced with Wav2Vec.

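Results are written as TSV files to
`/vol/tensusers2/wharmsen/SERDA-experiment-data/<r>/stories/ctc_kurz_w2v_fc/csv-scores`, where `<r>` is the round selected in the variable `r` below (e.g. `round2`).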

In [51]:
import pandas as pd
import os
import glob
import json

Helper functions

In [54]:
def readJSON(jsonFile):
    """Return the parsed contents of the JSON file at path `jsonFile`."""
    with open(jsonFile, 'r') as handle:
        rawText = handle.read()
    return json.loads(rawText)

def getSpeakerIDs(round):
    """Return the speaker IDs listed for a data-collection round.

    `round` is compared against 'round1' and 'round2'; for a match, the
    corresponding CSV is read and its speaker-ID column is returned as a list.
    Any other value falls through and yields None.
    """
    # (round name, CSV with the speaker IDs, column holding the IDs)
    sources = (
        ('round1',
         '/vol/tensusers2/wharmsen/SERDA-data/prompts/round1_speakerIDs.csv',
         'round1_speaker_ids'),
        ('round2',
         '/vol/tensusers2/wharmsen/SERDA-data/prompts/round2_speakerIDs.csv',
         'round2_speaker_ids'),
    )
    for roundName, csvPath, idColumn in sources:
        if round == roundName:
            speakerTable = pd.read_csv(csvPath)
            return list(speakerTable[idColumn])
    return None
    
def getPromptDF(task):
    """Read the word-index CSV for `task` and return it as a DataFrame.

    The file is looked up as '<task>-wordIDX.csv' inside the fixed prompts
    directory.
    """
    promptFile = os.path.join(
        '/vol/tensusers2/wharmsen/SERDA-data/prompts/',
        task + '-wordIDX.csv',
    )
    return pd.read_csv(promptFile)

def initializestoryInfoDict(storyInfoDict, uniqueStudents, word_ids, taskStr):
    """Add empty DataFrames (rows = students, columns = word IDs) to storyInfoDict.

    Three entries are created: start-of-speech, stop-of-speech and duration.
    Each key is built by substituting `taskStr` for 'story' in the key
    template. The dictionary is modified in place and also returned.
    """
    keyTemplates = (
        'storyCTCAsrStartSpeakDF',
        'storyCTCAsrStopSpeakDF',
        'storyCTCAsrTimeDF',
    )

    for template in keyTemplates:
        dfKey = template.replace('story', taskStr)
        # A fresh frame per key so the three tables never share storage.
        storyInfoDict[dfKey] = pd.DataFrame(index=uniqueStudents, columns=word_ids)

    return storyInfoDict

In [58]:
# Change this variable depending on the round of data you want to process.
# This is the only variable that needs to be changed manually.
r = 'round2'

asr_json_dir = '/vol/tensusers2/wharmsen/SERDA-experiment-data/' + r +'/stories/ctc_kurz_w2v_fc/json_align'
task_type = 'stories'
output_dir = '/vol/tensusers2/wharmsen/SERDA-experiment-data/' + r +'/stories/ctc_kurz_w2v_fc/csv-scores'

# exist_ok=True replaces the separate exists() check (no check-then-create race).
os.makedirs(output_dir, exist_ok=True)

# One forced-alignment JSON file per recording.
fileList = glob.glob(os.path.join(asr_json_dir, '*.json'))

# Bound before the task_type check so the export loop below never hits an
# undefined name when task_type is something other than 'stories'.
infoDict = {}

if task_type == 'stories':

    prompt_ids_story1 = getPromptDF('story_1')['prompt_id']
    prompt_ids_story2 = getPromptDF('story_2')['prompt_id']
    prompt_ids_story3 = getPromptDF('story_3')['prompt_id']

    # Speaker IDs of the configured round. The round name `r` must be passed
    # here: passing the builtin `round` matches neither branch of
    # getSpeakerIDs, which then returns None and leaves the frames without
    # any speaker rows.
    speakerIDs = getSpeakerIDs(r)

    # Initialize output DFs (studentIDs x promptIDs)
    infoDict = initializestoryInfoDict(infoDict, speakerIDs, prompt_ids_story1, 'story1')
    infoDict = initializestoryInfoDict(infoDict, speakerIDs, prompt_ids_story2, 'story2')
    infoDict = initializestoryInfoDict(infoDict, speakerIDs, prompt_ids_story3, 'story3')

    # Fill outputDFs
    for file in fileList:

        # Parse the file once; it is iterated as a sequence of entries that
        # each provide a 'start' and an 'end' value.
        alignment = readJSON(file)
        startTimes = [result['start'] for result in alignment]
        endTimes = [result['end'] for result in alignment]
        durations = [end - start for start, end in zip(startTimes, endTimes)]

        # File names are split on '-' into exactly three parts:
        # <studentID>-<taskID>-<rest>; the last part still carries '.json'
        # and is not used further.
        studentID, taskID, date = os.path.basename(file).split('-')

        # 'story_1' -> 'story1', matching the keys created by
        # initializestoryInfoDict.
        taskKey = taskID.replace('_', '')

        # Save data in proper outputDF.
        # NOTE(review): the row assignment needs one value per prompt-ID
        # column, so the JSON presumably holds one entry per prompt word - confirm.
        infoDict[taskKey + 'CTCAsrStopSpeakDF'].loc[studentID, :] = endTimes
        infoDict[taskKey + 'CTCAsrStartSpeakDF'].loc[studentID, :] = startTimes
        infoDict[taskKey + 'CTCAsrTimeDF'].loc[studentID, :] = durations


# Export the output DFs: word-level information, three DataFrames per story
# (start times, stop times, durations), each written as a tab-separated file
# named after its infoDict key.
for key, outputDF in infoDict.items():
    outputDF.to_csv(os.path.join(output_dir, key + '.tsv'), sep='\t')