# Translate ASR Transcriptions from Chinese to English 

In [1]:
from collections import defaultdict
import os
import json
import queue
import time

from googletrans import Translator
import googletrans
import pandas as pd
import numpy as np

## Load translation queue

In [2]:
home_dir = os.path.expanduser('~')

In [3]:
output_dir = os.path.join(home_dir, 'Documents/datasets/charm/transformed/translations')

In [4]:
data = {}
data_dfs = {}
translation_files = []
for x in os.listdir(output_dir):
    if x.endswith('.json'):
        filepath = os.path.join(output_dir, x)
        translation_files.append(filepath)
        file_id = x.split('.')[0]
        with open(filepath, 'r', encoding='utf-8') as fp:
            data[file_id] = json.load(fp)
        data_dfs[file_id] = pd.DataFrame(data[file_id]['asr_turn_lvl'])

In [5]:
data_dfs['M01003R3K'].head()

Unnamed: 0,start_time,end_time,transcript,speaker_id,transcript_en
0,0.0,6.38,@reject@ 梅洛,No Speaker Found to Attribute!,@Reject@
1,6.44,8.99,大 的 探 班 日记,unknownspk00,Big Exploring Diary
2,10.16,11.81,@reject@ @reject@ 积极,No Speaker Found to Attribute!,@Reject @@ Reject@
3,12.53,15.56,参加 就是 在家 分 分 的 潜艇 电影,unknownspk00,Participating is a submarine movie at home
4,23.85,24.75,嗯,No Speaker Found to Attribute!,Um


In [6]:
len(data_dfs)

96

## Perform translation

In [7]:
print("Supported languages:", googletrans.LANGUAGES)

Supported languages: {'af': 'afrikaans', 'sq': 'albanian', 'am': 'amharic', 'ar': 'arabic', 'hy': 'armenian', 'az': 'azerbaijani', 'eu': 'basque', 'be': 'belarusian', 'bn': 'bengali', 'bs': 'bosnian', 'bg': 'bulgarian', 'ca': 'catalan', 'ceb': 'cebuano', 'ny': 'chichewa', 'zh-cn': 'chinese (simplified)', 'zh-tw': 'chinese (traditional)', 'co': 'corsican', 'hr': 'croatian', 'cs': 'czech', 'da': 'danish', 'nl': 'dutch', 'en': 'english', 'eo': 'esperanto', 'et': 'estonian', 'tl': 'filipino', 'fi': 'finnish', 'fr': 'french', 'fy': 'frisian', 'gl': 'galician', 'ka': 'georgian', 'de': 'german', 'el': 'greek', 'gu': 'gujarati', 'ht': 'haitian creole', 'ha': 'hausa', 'haw': 'hawaiian', 'iw': 'hebrew', 'he': 'hebrew', 'hi': 'hindi', 'hmn': 'hmong', 'hu': 'hungarian', 'is': 'icelandic', 'ig': 'igbo', 'id': 'indonesian', 'ga': 'irish', 'it': 'italian', 'ja': 'japanese', 'jw': 'javanese', 'kn': 'kannada', 'kk': 'kazakh', 'km': 'khmer', 'ko': 'korean', 'ku': 'kurdish (kurmanji)', 'ky': 'kyrgyz', 'l

In [8]:
class GoogleTranslator(object):
    def __init__(self, src='zh-cn', dest='en'):
        self.translator = Translator()
        self.src = src
        self.dest = dest
        
    def translate(self, input_texts):
        if not isinstance(input_texts,list):
            input_texts = [input_texts]
        
        target_texts = []
        for text in input_texts:
            translation = self.translator.translate(text, src=self.src, dest=self.dest)
            target_texts.append(translation.text)
        return target_texts

In [9]:
q = queue.Queue()
err_q = queue.Queue()

# put all work items in the queue
for f in data_dfs:
    q.put((f, data_dfs[f]))

# sentinel
q.put(None)

In [10]:
q.unfinished_tasks

97

In [11]:
translator = Translator()

In [12]:
completed = []
errors = [] # keep track of errors
error_count = 0 # if error count exceeds 100, stop the program and inspect
# get a file from the queue, translate utterance by utterance, checking if translation column has text in it or not
for job in iter(q.get, None): # iterate until we get the sentinel of None
    f, df = job
    start = time.time()
    print(f'Translating: {f}')
    for idx, row in df.iterrows():
        # if text already present in the transcript_en column, move on to the next row, already translated
        if isinstance(row['transcript_en'], str):
            continue    

        # otherwise, attempt to translate
        chinese = row['transcript']
        # remove extra spacing - unclear how this will affect performance
        chinese = chinese.replace(' ', '')
        try:
            translation = translator.translate(text=chinese, src='zh-cn', dest='en')
            df.loc[idx, 'transcript_en'] = translation.text
        except Exception as e:
            error = (f, idx, e)
            print(error)
            errors.append((f, idx, e))
            error_count += 1
            ## check on the program at this point, something might be majorly wrong
            if error_count > 1000:
                break
    completed.append(f)
    end = time.time()
    duration = end - start
    print(f"Time taken to translate {len(df)} utterances, with average length {df['transcript'].apply(lambda x: len(x)).mean():.2f}: {duration:.2f} seconds")
    print(f'Time per translation: {duration / len(df): .2f}')
    
    # write partial results to disk for each file so we don't lose progress
    asr_turn_lvl = df.to_dict(orient='records')
    
    # update the key in the data dict
    data[f]['asr_turn_lvl'] = asr_turn_lvl

    filepath = os.path.join(output_dir, f'{f}.json')
    with open(filepath, 'w', encoding='utf-8') as fh:
        json.dump(data[f], fh)

Translating: M01003R3K
Time taken to translate 100 utterances, with average length 40.17: 0.00 seconds
Time per translation:  0.00
Translating: M01003XJU
Time taken to translate 217 utterances, with average length 21.56: 0.00 seconds
Time per translation:  0.00
Translating: M01000545
Time taken to translate 295 utterances, with average length 19.36: 0.00 seconds
Time per translation:  0.00
Translating: M01000AJ9
Time taken to translate 78 utterances, with average length 10.27: 0.00 seconds
Time per translation:  0.00
Translating: M01003Y73
Time taken to translate 90 utterances, with average length 43.28: 0.00 seconds
Time per translation:  0.00
Translating: M0100054N
Time taken to translate 224 utterances, with average length 23.73: 0.00 seconds
Time per translation:  0.00
Translating: M0100053I
Time taken to translate 267 utterances, with average length 21.64: 0.00 seconds
Time per translation:  0.00
Translating: M01003LEP
Time taken to translate 60 utterances, with average length 13.

In [13]:
# estimated time to make all API calls
(21205 * 0.77) / 60 / 60

4.535513888888889

In [16]:
# save a sample conversation to csv for easy viewing
transcription_len = []
for f in data_dfs:
    transcription_len.append((f, len(data_dfs[f])))

In [19]:
len_df = pd.DataFrame(transcription_len, columns=['file_id', 'transcription_len'])

In [21]:
len_df['transcription_len'].describe()

count      96.000000
mean      220.885417
std       185.040187
min        28.000000
25%       100.750000
50%       162.500000
75%       279.250000
max      1104.000000
Name: transcription_len, dtype: float64

In [22]:
# find a median length conversation
len_df[(len_df['transcription_len'] > 150) & (len_df['transcription_len'] < 170)]

Unnamed: 0,file_id,transcription_len
8,M010040JE,159
44,M01003WEQ,164
63,M01003WXV,159
74,M010040QR,161
80,M01003Y07,151


In [24]:
# save sample: https://www.bilibili.com/video/BV1TK4y1D7M4/
sample_df = data_dfs['M010040QR']

In [26]:
sample_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/M010040QR_transcription.xlsx')
sample_df.to_excel(sample_filepath, index=False)