# Prepare ASR Transcript Translation Queue (Chinese to English)
# WARNING! Running this script with `overwrite_cache = True` will overwrite the queue files in the translation directory

In [1]:
from collections import defaultdict
import os
import json
import queue

from googletrans import Translator
import googletrans
import pandas as pd
import numpy as np

## Load transcripts

In [2]:
# current user's home directory; the dataset paths in this notebook are built relative to it
home_dir = os.path.expanduser('~')

In [3]:
# directories containing the ASR output JSON files (one R2 batch plus the R1 audio and video sets)
asr_dirs = [
    os.path.join(home_dir, rel_path)
    for rel_path in (
        'Documents/datasets/charm/transformed/R2/ldc-r2-batch1-tom-n79',
        'Documents/datasets/charm/transformed/R1/audio_processed',
        'Documents/datasets/charm/transformed/R1/video_processed',
    )
]

In [4]:
# Walk every ASR directory and index the JSON files found there:
#   asr_files    - full path of each JSON file
#   file_ids     - filename prefix before the first underscore, parallel to asr_files
#   files_by_dir - "<parent>/<dir>" group label -> list of full paths
#   dir_by_file  - bare filename -> group label
asr_files, file_ids = [], []
files_by_dir = defaultdict(list)
dir_by_file = {}
for asr_dir in asr_dirs:
    for name in os.listdir(asr_dir):
        if not name.endswith('.json'):
            continue
        json_path = os.path.join(asr_dir, name)
        asr_files.append(json_path)
        file_ids.append(name.split('_')[0])
        # group label = the two path components directly above the file
        path_parts = json_path.split(os.sep)
        group_label = os.path.join(*path_parts[-3:-1])
        files_by_dir[group_label].append(json_path)
        dir_by_file[os.path.basename(json_path)] = group_label

In [5]:
# Load every ASR JSON file, keeping both the parsed dict and a per-turn DataFrame.
#   raw_data - filename -> parsed JSON dict
#   data_dfs - filename -> DataFrame built from the turn-level records
raw_data = {}
data_dfs = {}
for f in asr_files:
    filename = os.path.basename(f)
    # Read explicitly as UTF-8 instead of relying on the platform default
    # encoding; the queue files written later in this notebook also use UTF-8.
    with open(f, 'r', encoding='utf-8') as fh:
        raw_data[filename] = json.load(fh)
    # The turn-level list is stored under one of two keys; prefer 'asr_turn_lvl'.
    # A file with neither key raises KeyError.
    if 'asr_turn_lvl' in raw_data[filename]:
        turn_key = 'asr_turn_lvl'
    else:
        turn_key = 'asr_preprocessed_turn_lvl'
    data_dfs[filename] = pd.DataFrame(raw_data[filename][turn_key])

In [6]:
# sanity check: number of distinct transcript filenames loaded into DataFrames
len(data_dfs)

245

## Load LDC annotations

In [7]:
# source: https://drive.google.com/drive/folders/1aL7bcLWQmUskR3dmj3K1jdXQsb_nIcv2
anno_dir = os.path.join(
    home_dir,
    'Documents/datasets/charm/raw/LDC2022E18_CCU_TA1_Mandarin_Chinese_Development_Annotation_V1.0/data',
)
# full path of every entry in the annotation directory, skipping the '.DS_Store' entry
anno_files = []
for entry in os.listdir(anno_dir):
    if entry != '.DS_Store':
        anno_files.append(os.path.join(anno_dir, entry))

In [8]:
# read each annotation file as a tab-separated table, keyed by its bare filename
anno_dfs = {
    os.path.basename(anno_path): pd.read_csv(anno_path, sep='\t')
    for anno_path in anno_files
}

In [9]:
# NOTE: `anno_files` is rebound here from a list of paths to a dict mapping
# annotation filename -> array of the unique file_ids that table mentions
anno_files = {
    anno_name: anno_df['file_id'].unique()
    for anno_name, anno_df in anno_dfs.items()
}
# sorted list of every file_id mentioned in any annotation table
anno_files_list = sorted(set().union(*anno_files.values()))

In [10]:
# file_ids that appear both in the LDC annotations and among the ASR transcript files
ldc_intersection = set(file_ids) & set(anno_files_list)

In [11]:
# number of annotated file_ids for which an ASR transcript file was found
len(ldc_intersection)

96

## Quantify number of translations and characters per translation

In [12]:
# Per annotated file: how many utterances it has (one translation call each),
# and, pooled across all files, the character length of every utterance.
num_calls_per_trans = []
num_chars_per_utter = []
for annotated_id in ldc_intersection:
    turns_df = data_dfs[f'{annotated_id}_processed_results.json']
    num_calls_per_trans.append(len(turns_df))
    num_chars_per_utter.extend([len(text) for text in turns_df['transcript']])

In [13]:
# stats on API calls per file: count = number of files, mean/max = average and
# largest number of utterances (one API call each) in a single file
pd.DataFrame(num_calls_per_trans, columns=['API Calls']).describe()

Unnamed: 0,API Calls
count,96.0
mean,220.885417
std,185.040187
min,28.0
25%,100.75
50%,162.5
75%,279.25
max,1104.0


In [14]:
# stats on utterance length: count = total number of utterances, mean/max =
# average and largest number of characters in a single utterance
pd.DataFrame(num_chars_per_utter, columns=['Utterances']).describe()

Unnamed: 0,Utterances
count,21205.0
mean,27.635416
std,49.113722
min,1.0
25%,11.0
50%,18.0
75%,30.0
max,2244.0


In [15]:
# total number of characters across all utterances queued for translation
sum(num_chars_per_utter)

586009

## Develop a process for keeping track of successes/failures
- we can have partial success on the list of files
- each file can have partial success on its utterances
- the work queue should be a list of files
- each element in the queue is a DF containing all utterances plus a translated column (`transcript_en`)
- results can then be reprocessed idempotently by checking whether the translated column is null
- using a queue also sets us up to use threads in the future

In [16]:
# directory where the translation queue files are written and later re-read
output_dir = os.path.join(home_dir, 'Documents/datasets/charm/transformed/translations')
# create the directory (and any missing parents); no error if it already exists
os.makedirs(output_dir, exist_ok=True)

In [17]:
# Write the initial queue to the output directory; all future jobs read from
# this directory and push the translations toward completion.
# WARNING: only do this once. Re-running with overwrite_cache = True rewrites
# every queue file and discards any translations already saved there.
overwrite_cache = False
if overwrite_cache:

    # filename -> copy of the raw JSON dict with a standardized 'asr_turn_lvl' list
    initial_queue = {}
    for f in data_dfs:
        base_id = f.split('_')[0]
        # only queue up files that we have labels for
        if base_id not in ldc_intersection:
            continue

        # Placeholder column marking every utterance as "not yet translated".
        # np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
        # NOTE: json.dump serializes NaN as the bare token `NaN`, which Python's
        # json module reads back but strict JSON parsers may reject.
        data_dfs[f]['transcript_en'] = np.nan
        asr_turn_lvl = data_dfs[f].to_dict(orient='records')

        # shallow copy so the top-level keys of raw_data[f] are left untouched
        initial_queue[f] = dict(raw_data[f])
        # standardize the turn-level key: files that only carry
        # 'asr_preprocessed_turn_lvl' have it replaced by 'asr_turn_lvl'
        if 'asr_turn_lvl' not in initial_queue[f]:
            initial_queue[f].pop('asr_preprocessed_turn_lvl')
        initial_queue[f]['asr_turn_lvl'] = asr_turn_lvl

        # write this queue entry to disk, using <file_id>.json as the filename
        file_id = base_id + '.json'
        filepath = os.path.join(output_dir, file_id)
        with open(filepath, 'w', encoding='utf-8') as fh:
            json.dump(initial_queue[f], fh)

## Reload saved data and verify correctness

In [18]:
# Round-trip check, run only right after the queue was (re)written: reload each
# saved queue file and assert that it still matches the original raw data.
if overwrite_cache:
    queue_check = {}
    translation_files = []
    for json_name in os.listdir(output_dir):
        if not json_name.endswith('.json'):
            continue
        saved_path = os.path.join(output_dir, json_name)
        translation_files.append(saved_path)
        saved_id = json_name.split('.')[0]
        with open(saved_path, 'r', encoding='utf-8') as saved_fh:
            queue_check[saved_id] = json.load(saved_fh)

        # The saved turns carry an extra "transcript_en" key, so whole-dict
        # equality cannot be used for them: compare only the keys that exist
        # in the original turn records.
        original = raw_data[f'{saved_id}_processed_results.json']
        reloaded = queue_check[saved_id]
        for key, value in original.items():
            if key in ('asr_preprocessed_turn_lvl', 'asr_turn_lvl'):
                # both original key names are stored as 'asr_turn_lvl' in the queue file
                for idx, turn in enumerate(value):
                    for subkey, expected in turn.items():
                        assert expected == reloaded['asr_turn_lvl'][idx][subkey]
            else:
                assert value == reloaded[key]