In [163]:
import os
import re
import json
from deep_translator import GoogleTranslator
#from deep_translator.exceptions import NotValidPayload

In [164]:
# Language code of the source text. It is compared against tree['prompt']['lang'] to
# select trees and is passed as the translator's `source` language.
LANGUAGE = 'en'

In [165]:
data_path = '../data/2023-04-12_oasst_ready.trees.jsonl'
# One JSON document per line. Decode explicitly as UTF-8 (the translated trees are
# written as UTF-8 as well) instead of relying on the platform default encoding,
# and skip blank lines so a trailing newline cannot break json.loads.
with open(data_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [166]:
# Pick the first tree whose prompt is written in LANGUAGE for a quick look at the
# record structure. `tree` is None when no such tree exists, rather than silently
# ending up as the last tree of some other language.
tree = next((t for t in data if t['prompt']['lang'] == LANGUAGE), None)
tree

{'message_tree_id': '6ab24d72-0181-4594-a9cd-deaf170242fb',
 'tree_state': 'ready_for_export',
 'prompt': {'message_id': '6ab24d72-0181-4594-a9cd-deaf170242fb',
  'user_id': 'c3fe8c76-fc30-4fa7-b7f8-c492f5967d18',
  'created_date': '2023-02-05T14:23:50.983374+00:00',
  'text': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.',
  'role': 'prompter',
  'lang': 'en',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'synthetic': False,
  'emojis': {'+1': 10, '_skip_reply': 1, '_skip_ranking': 4},
  'replies': [{'message_id': 'c8e83833-ecbc-44fe-b6db-735228c25a1c',
    'parent_id': '6ab24d72-0181-4594-a9cd-deaf170242fb',
    'user_id': '2c96e467-66f0-4be7-9693-bda51356a424',
    'created_date': '2023-02-06T13:50:44.657083+00:00',
    'text': '"Monopsony" refers to a market structure where there is only one buyer for a particula

### Translate the data

In [167]:
# Translation backend used by the helper functions defined below: it translates
# from LANGUAGE into the target language code 'sl'.
# NOTE(review): 'sl' is presumably Slovenian (the sample output at the end of the
# notebook looks consistent with that) — confirm.
translator = GoogleTranslator(source=LANGUAGE, target='sl')
# Alternative backends kept for reference. Their classes are not imported in the
# import cell visible at the top of this notebook, so enabling one needs an import.
#translator = PonsTranslator(source='en', target='sl')
#translator = DeeplTranslator(source='en', target='sl')
#translator = MyMemoryTranslator(source='en', target='sl')

def extract_code_block(text):
    """Collect the contents of every fenced code block in ``text``.

    A fenced block is three backticks, an optional tag made only of lowercase
    ASCII letters, a newline, the block body, a newline and three closing
    backticks. Only the body (without the fences) is captured.

    Args:
        text: The string to scan.

    Returns:
        A list with one string per fenced block, in order of appearance; an
        empty list when the text contains no such block.
    """
    fence_pattern = re.compile(r'```[a-z]*\n([\s\S]*?)\n```', re.MULTILINE)
    return fence_pattern.findall(text)


def separate_text_and_code(text):
    """Mask the body of every fenced code block in ``text`` with ``'XXX'``.

    Only the text between a pair of fences is replaced. The fences themselves,
    and any prose that happens to contain the same characters as a code block,
    are left untouched. Exactly one placeholder is produced per code block,
    including blocks whose body is empty, so the placeholders can later be
    matched one-to-one with the returned bodies.

    Args:
        text: The message text, possibly containing fenced code blocks
            (three backticks, optional lowercase tag, newline, body, newline,
            three backticks).

    Returns:
        A ``(masked_text, code_blocks)`` tuple: the text with each block body
        replaced by ``'XXX'``, and the list of original bodies in order of
        appearance.
    """
    code_blocks = []

    def _mask(match):
        # Remember the body and keep the surrounding fences exactly as written.
        code_blocks.append(match.group(2))
        return match.group(1) + 'XXX' + match.group(3)

    # Same fence grammar as extract_code_block, with the fences captured so they
    # can be re-emitted around the placeholder.
    masked = re.sub(r'(```[a-z]*\n)([\s\S]*?)(\n```)', _mask, text)
    return masked, code_blocks


def _translate(text):
    """Translate one piece of text while keeping fenced code blocks verbatim.

    Code block bodies are swapped for ``'XXX'`` placeholders before the text is
    handed to the module-level ``translator``, and are put back afterwards in
    their original order.

    Args:
        text: The text to translate.

    Returns:
        The translated text with the original code restored, the input
        unchanged when it is blank or consists only of digits, or ``None``
        when the translator returned ``None``.
    """
    # Extract code blocks so that their contents are not translated.
    text, code_blocks = separate_text_and_code(text)

    if not text.strip() or text.isdigit():
        # Nothing to translate: pass blank input and bare numbers through as-is.
        # NOTE(review): presumably some deep_translator versions reject such
        # payloads instead of returning them — confirm against the installed version.
        translation = text
    else:
        translation = translator.translate(text, from_language=LANGUAGE)

    # Let the caller decide how to handle a failed translation instead of
    # crashing while trying to restore code into a missing result.
    if translation is None:
        return None

    # Put the code blocks back in a single left-to-right pass: the n-th
    # placeholder receives the n-th block. Restored code is never rescanned,
    # so an 'XXX' inside a code block cannot swallow a later block. Surplus
    # placeholders are left as they are.
    remaining = iter(code_blocks)
    return re.sub('XXX', lambda match: next(remaining, match.group(0)), translation)


def translate(text, max_chars=5000):
    """Translate ``text``, falling back to line-by-line translation when it is long.

    Text shorter than ``max_chars`` is translated in one request. Longer text
    is split on newlines and every line is translated separately. A line whose
    translation comes back as ``None`` is reported and kept in its original
    form, so the output always has the same number of lines as the input.

    Limitation: in the line-by-line path each line is processed on its own, so
    a fenced code block that spans several lines is not recognised as code and
    its lines are sent to the translator like any other text.

    Args:
        text: The text to translate.
        max_chars: Length from which the text is split into lines.
            NOTE(review): the default of 5000 presumably mirrors the
            translation backend's per-request limit — confirm.

    Returns:
        The translated text.
    """
    if len(text) < max_chars:
        return _translate(text)

    translated_lines = []
    for line in text.split('\n'):
        translated = _translate(line)
        if translated is None:
            # Keep the source line rather than silently dropping content.
            print('Translation failed for:', line)
            translated = line
        translated_lines.append(translated)
    return '\n'.join(translated_lines)

In [168]:
def translate_message(message, translate):
    """Add a ``'translation'`` entry to ``message`` and to all of its replies.

    The message dictionary is modified in place. When ``translate`` returns
    ``None`` or an empty result, the failure is reported and the original text
    is stored as the translation. The replies are still processed in that case,
    so every message in the tree ends up with a ``'translation'`` key.

    Args:
        message: A message dictionary with a ``'text'`` entry and, optionally,
            a ``'replies'`` list of message dictionaries of the same shape.
        translate: A callable taking the text and returning its translation.

    Returns:
        The same ``message`` object that was passed in.
    """
    text = message['text']
    translation = translate(text)
    if translation is None or len(translation) == 0:
        print('Translation failed for:', text)
        # Fall back to the untranslated text, but keep descending into replies.
        translation = text
    message['translation'] = translation

    # Replies are updated in place; a missing or None 'replies' means no children.
    for reply in message.get('replies') or []:
        translate_message(reply, translate)
    return message
    

In [169]:
def translate_tree(tree, translate):
    """Translate the whole conversation stored in ``tree``.

    The message under ``tree['prompt']`` is passed to ``translate_message``
    together with the ``translate`` callable, and the result is stored back
    under the same key.

    Args:
        tree: A dictionary with the root message under ``'prompt'``.
        translate: A callable taking a text and returning its translation.

    Returns:
        The same ``tree`` object that was passed in.
    """
    root_message = tree['prompt']
    tree['prompt'] = translate_message(root_message, translate)
    return tree

In [170]:
def translate_dataset(data, translate, translations_path='data/translated'):
    """Translate every tree written in ``LANGUAGE`` and save each one as JSON.

    Each translated tree is written to ``<translations_path>/<message_tree_id>.json``.
    Trees whose prompt language differs from ``LANGUAGE`` and trees whose output
    file already exists are skipped, so an interrupted run can be resumed.
    Errors raised while translating or saving a tree are printed and recorded;
    processing continues with the next tree.

    Args:
        data: Iterable of tree dictionaries with ``'message_tree_id'`` and
            ``'prompt'`` entries.
        translate: A callable taking a text and returning its translation.
        translations_path: Directory for the output files; created if missing.

    Returns:
        The list of ``message_tree_id`` values of the trees that failed.
    """
    # Create the output directory up front; otherwise every tree would be
    # translated first and only then fail when its file cannot be opened.
    if translations_path:
        os.makedirs(translations_path, exist_ok=True)

    failed_ids = []
    for tree in data:
        if tree['prompt']['lang'] != LANGUAGE:
            continue
        tree_id = tree['message_tree_id']
        tree_path = os.path.join(translations_path, tree_id + '.json')
        if os.path.exists(tree_path):
            continue
        # Catch errors at this stage so as not to stop translation of other trees.
        try:
            translated = translate_tree(tree, translate)
            # Write to a temporary file and rename it into place, so that an
            # interrupted write never leaves a truncated file that a later run
            # would mistake for a finished translation.
            tmp_path = tree_path + '.tmp'
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(translated, f, ensure_ascii=False)
            os.replace(tmp_path, tree_path)
        except Exception as e:
            print(e)
            print('Translation failed for tree: ' + tree_id)
            failed_ids.append(tree_id)
    return failed_ids

In [171]:
# Translate the first 5182 entries of `data` and store the results under
# '../data/google_translate'. Trees whose output file already exists there are skipped.
# NOTE(review): the cutoff 5182 is not explained anywhere in this notebook — confirm
# why the run is limited to this slice.
failed_ids = translate_dataset(data[:5182], translate, translations_path='../data/google_translate')

In [172]:
# IDs of the trees for which translate_dataset caught an exception in the run above.
failed_ids

[]

## Check the translated conversation

In [173]:
def generate_conversation(tree):
    """Build one linear conversation from a translated tree.

    Starting at ``tree['prompt']``, the first reply of each message is followed
    until a message without replies is reached. Other replies are ignored.

    Args:
        tree: A tree dictionary whose messages carry ``'role'``,
            ``'translation'`` and ``'replies'`` entries.

    Returns:
        A list of ``(role, translation)`` tuples, starting with the prompt.
    """
    node = tree['prompt']
    turns = [(node['role'], node['translation'])]
    while True:
        children = node['replies']
        if len(children) == 0:
            break
        # Always continue with the first reply only.
        node = children[0]
        turns.append((node['role'], node['translation']))
    return turns

In [174]:
tree_id = '5249c721-e6d7-45f3-9680-f211d7be308e'
#tree_id = data[1236]['message_tree_id']

# The translated trees are written as UTF-8 with non-ASCII characters kept as-is,
# so read them back as UTF-8 instead of relying on the platform default encoding.
with open('../data/google_translate/' + tree_id + '.json', 'r', encoding='utf-8') as f:
    translated = json.load(f)

In [175]:
# Print the conversation one turn per line, with the speaker's role in upper case.
conv = generate_conversation(translated)
for role, line in conv:
    print(role.upper(), line, sep=': ')

PROMPTER: Katere so stopnje razvoja in iz česa so sestavljene po Piagetu?
ASSISTANT: Jean Piaget je bil švicarski psiholog, ki je predlagal teorijo človeškega kognitivnega razvoja, ki je sestavljena iz štirih stopenj:

Senzomotorična stopnja (0-2 leti): V tej fazi se otrok uči s svojimi čuti in gibi. Odkrije, da lahko njegova dejanja vplivajo na okolje, in začne oblikovati osnovno idejo o objektivnosti in kontinuiteti.

Predoperativna stopnja (2-7 let): V tej fazi začne otrok razvijati simbolno mišljenje in razumeti, da stvari lahko predstavljajo druge stvari. Prav tako začnejo razvijati logično razmišljanje in razumejo koncepte, kot sta vzrok in posledica.

Faza konkretnih operacij (7-12 let): V tej fazi otrok razvije logično razmišljanje in razume vzročne zveze. Začnejo razumeti, da imajo stvari lahko več perspektiv in da so koncepti lahko bolj zapleteni, kot se zdijo na prvi pogled.

Stopnja formalnih operacij (12 let in več): Na tej stopnji posameznik razvije abstraktno mišljenje i