In [2]:
import json
import os
import sys

In [3]:
# Read both samples as lists of raw JSON lines (one JSON document per line).
# The encoding is pinned to UTF-8 because the messages contain Chinese text;
# relying on the platform default breaks on systems whose locale encoding is
# not UTF-8.
with open("CompleteSample1.jsonl", "r", encoding="utf-8") as f:
    complete_data = list(f)

with open("InputsOnlySample1.jsonl", "r", encoding="utf-8") as f:
    input_data = list(f)

In [5]:
def get_turn_text_translation_zh_en(message, debug=False):
    """
        Validates a message from the translation RESULT queue and returns
        its cleaned translation text.

        Note: despite the `zh_en` suffix in the name, the checks below only
        accept messages whose `source_language_code` is "en" and whose
        `target_language_code` is "zh".

        A message is rejected (None is returned) when any of these hold:
          * it has no `translation` or `asr_type` field,
          * `asr_type` is not "TURN",
          * `source_language_code` is missing or not "en",
          * `target_language_code` is missing or not "zh",
          * `translation` is not a str, or is empty once the "@reject@"
            marker and surrounding whitespace are removed.

        :param message: json: parsed message (dict-like) to validate
        :param debug: bool: when True, print a warning naming the failed check
        :returns: str: the cleaned translation text, or None if rejected
    """
    def _reject(*warning):
        # Single exit path for every failed check so that no branch can
        # forget to return None after emitting its warning.
        if debug:
            print(*warning)
        return None

    if "translation" not in message:
        return _reject("Warning: Translation message does not have a `translation` field")
    if "asr_type" not in message:
        return _reject("Warning: Translation message contains no `asr_type` field")
    if message["asr_type"] != "TURN":
        return _reject("Warning: Wrong message type", message['asr_type'])
    if "source_language_code" not in message:
        return _reject("Warning: Translation message does not have a `source_language_code` field.")
    if message["source_language_code"] != "en":
        return _reject("Warning: Looking at the wrong source language:",
                       message['source_language_code'])
    if "target_language_code" not in message:
        return _reject("Warning: Translation message does not have a `target_language_code` field.")
    if message['target_language_code'] != "zh":
        return _reject("Warning: Looking at the wrong target language:",
                       message['target_language_code'])

    turn_text = message['translation']

    if not isinstance(turn_text, str):
        return _reject("Warning: Text field in translation message has type - ",
                       type(turn_text))

    # Drop the "@reject@" marker wherever it occurs, then trim whitespace.
    turn_text = turn_text.replace("@reject@", "").strip()

    if turn_text == '':
        return _reject("Warning: Text field in translation message is empty or None")

    return turn_text

In [6]:
## Complete data
# Run the translation validator with debug output enabled over every
# RESULT-queue translation message, so each rejected message prints the
# reason it was rejected. The first line of the file is skipped.
count = 0

for json_str in complete_data[1:]:
    result = json.loads(json_str)
    if result['queue'] != 'RESULT':
        continue

    message = result['message']
    if 'type' not in message or message['type'] != 'translation':
        continue

    text = get_turn_text_translation_zh_en(message, True)





In [7]:
def get_turn_text_asr_result_zh(message, debug=False):
    """Return the cleaned ASR text of a valid "zh" TURN message, else None.

    A message is accepted only when it has an `asr_text` field, its
    `asr_type` is "TURN" and its `asr_language_code` is "zh". The text must
    be a str; the "@reject@" marker is removed and surrounding whitespace is
    trimmed. If nothing is left, the message is rejected.

    :param message: json: parsed ASR message (dict-like) to validate
    :param debug: bool: when True, print a warning for most failed checks
        (the two language-code checks reject silently)
    :returns: str: the cleaned text, or None if any check fails
    """
    if 'asr_text' not in message:
        if debug:
            print("Warning: ASR message contains no text field")
        return None

    if 'asr_type' not in message:
        if debug:
            print("Warning: ASR message contains no type field")
        return None

    if message['asr_type'] != "TURN":
        if debug:
            print("Wrong message type", message['asr_type'])
        return None

    # Language checks reject without any debug output.
    if 'asr_language_code' not in message or message['asr_language_code'] != 'zh':
        return None

    raw_text = message['asr_text']
    if type(raw_text) is not str:
        if debug:
            print("Warning: Text field in ASR message has type - ",
                  type(raw_text))
        return None

    cleaned = raw_text.replace("@reject@", "").strip()
    if not cleaned:
        if debug:
            print("Warning: Text field in ASR message is empty or None")
        return None

    return cleaned

Let's see the dialogue

In [8]:
text_asr = None
text_translation = None

# Walk the complete sample (skipping its first line) and print every
# RESULT-queue message that passes validation, together with its time span.
for json_str in complete_data[1:]:
    result = json.loads(json_str)
    if result['queue'] != 'RESULT':
        continue

    message = result['message']
    if 'type' not in message:
        continue

    if message['type'] == 'asr_result':
        text_asr = get_turn_text_asr_result_zh(message, debug=False)
        if text_asr:
            print(f"From: {message['start_seconds']} to {message['end_seconds']}",
                  f"Chinese: {text_asr}",
                  sep="\n")
    elif message['type'] == 'translation':
        text_translation = get_turn_text_translation_zh_en(message, debug=False)
        if text_translation:
            print(f"From: {message['start_seconds']} to {message['end_seconds']}",
                  f"English (translate): {text_translation}",
                  sep="\n")


From: 142.08191990852356 to 145.35201907157898
English (translate): 我们需要谈谈你的团队处理垃圾的方式
From: 158.33351707458496 to 161.35685634613037
Chinese: 好吧,您好。
From: 169.746084690094 to 174.81492519378662
English (translate): 对不起我忘了我的礼貌。你好李先生。你今天好吗？
From: 188.55391645431519 to 192.9584197998047
Chinese: 我很好,谢谢,您怎么样?
From: 206.53500533103943 to 209.5392620563507
English (translate): 我的健康状况很好所以我不能抱怨。
From: 221.2496690750122 to 227.57230234146118
Chinese: I'm glad to hear that. Are you here to talk about garbage?
From: 242.73591995239258 to 249.49627494812012
English (translate): 是的你的团队又把垃圾丢在河边了。你得跟他们谈谈。
From: 265.8182649612427 to 270.25758361816406
Chinese: 你到我的家里来提出要求。
From: 291.1044890880585 to 297.13668990135193
Chinese: I can't believe you would do this. You came to my house to make a request.
From: 314.9832181930542 to 318.94423270225525
Chinese: 你到我的家里来提出要求。
From: 346.5063889026642 to 350.4387912750244
Chinese: 你真的到我的家里来提出要求?
From: 375.22095918655396 to 378.8630485534668
Chinese: 你到我的家里提出要求。


In [10]:
text_asr = None
text_translation = None

# Tallies: every line read, every message handed to a validator
# ("acted upon"), and every message that came back with usable text.
total_count = 0
acted_upon_asr_results = 0
acted_upon_tran_results = 0
asr_results = 0
tran_results = 0

for json_str in input_data:
    result = json.loads(json_str)
    total_count += 1

    if result['queue'] != 'RESULT':
        continue

    message = result['message']
    if 'type' not in message:
        continue

    if message['type'] == 'asr_result':
        text_asr = get_turn_text_asr_result_zh(message, debug=False)
        acted_upon_asr_results += 1
        if text_asr:
            print(f"From: {message['start_seconds']} to {message['end_seconds']}",
                  f"Chinese: {text_asr}",
                  sep="\n")
            asr_results += 1
    elif message['type'] == 'translation':
        text_translation = get_turn_text_translation_zh_en(message, debug=False)
        acted_upon_tran_results += 1
        if text_translation:
            print(f"From: {message['start_seconds']} to {message['end_seconds']}",
                  f"English (translate): {text_translation}",
                  sep="\n")
            tran_results += 1

From: 142.08191990852356 to 145.35201907157898
English (translate): 我们需要谈谈你的团队处理垃圾的方式
From: 158.33351707458496 to 161.35685634613037
Chinese: 好吧,您好。
From: 169.746084690094 to 174.81492519378662
English (translate): 对不起我忘了我的礼貌。你好李先生。你今天好吗？
From: 188.55391645431519 to 192.9584197998047
Chinese: 我很好,谢谢,您怎么样?
From: 206.53500533103943 to 209.5392620563507
English (translate): 我的健康状况很好所以我不能抱怨。
From: 221.2496690750122 to 227.57230234146118
Chinese: I'm glad to hear that. Are you here to talk about garbage?
From: 242.73591995239258 to 249.49627494812012
English (translate): 是的你的团队又把垃圾丢在河边了。你得跟他们谈谈。
From: 265.8182649612427 to 270.25758361816406
Chinese: 你到我的家里来提出要求。
From: 291.1044890880585 to 297.13668990135193
Chinese: I can't believe you would do this. You came to my house to make a request.
From: 314.9832181930542 to 318.94423270225525
Chinese: 你到我的家里来提出要求。
From: 346.5063889026642 to 350.4387912750244
Chinese: 你真的到我的家里来提出要求?
From: 375.22095918655396 to 378.8630485534668
Chinese: 你到我的家里提出要求。


In [11]:
print(f"Total lines: {total_count}")
print(f"ASR Matches: {acted_upon_asr_results}")
print(f"Actual valid ASR Results: {asr_results}")
print(f"Translation Matches: {acted_upon_tran_results}")
print(f"Actual valid Translation Results: {tran_results}")

Total lines: 8314
ASR Matches: 28
Actual valid ASR Results: 15
Translation Matches: 28
Actual valid Translation Results: 13


In [None]:
with open("../transcripts/")