In [68]:
import os
import re
import json
from deep_translator import GoogleTranslator
#from deep_translator.exceptions import NotValidPayload

In [8]:
# Load the conversation trees: the file is read as JSON Lines, one JSON object per line.
data_path = '../data/2023-04-12_oasst_ready.trees.jsonl'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (the translated trees are written as UTF-8 further down).
with open(data_path, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline at EOF) instead of crashing in json.loads.
    data = [json.loads(line) for line in f if line.strip()]

### Translate the data

In [122]:
# Module-level translator shared by _translate(): source language 'en', target language 'sl'.
translator = GoogleTranslator(source='en', target='sl')
# Alternative back-ends kept for reference; presumably also from deep_translator,
# but their classes are not imported in the visible import cell.
#translator = PonsTranslator(source='en', target='sl')
#translator = DeeplTranslator(source='en', target='sl')
#translator = MyMemoryTranslator(source='en', target='sl')

# A fenced code block: opening fence with an optional lowercase language tag,
# the lazily matched body, and the newline + closing fence.
# Group 1 = opening fence, group 2 = code body, group 3 = closing fence.
_CODE_FENCE_RE = re.compile(r'(```[a-z]*\n)([\s\S]*?)(\n```)')


def extract_code_block(text):
    """Return the bodies of all fenced code blocks found in ``text``.

    Args:
        text: Message text that may contain ```-fenced code blocks.

    Returns:
        A list with the code inside each fence, in order of appearance
        (empty list when there is none).
    """
    return [match.group(2) for match in _CODE_FENCE_RE.finditer(text)]


def separate_text_and_code(text):
    """Replace the body of every fenced code block with the placeholder 'XXX'.

    Only the span between the fences is replaced. A plain string replace of
    the code would also hit identical text in the surrounding prose and, for
    an empty code block, would insert a placeholder between every character.

    Args:
        text: Message text that may contain ```-fenced code blocks.

    Returns:
        A ``(text, code_blocks)`` tuple: the text with exactly one 'XXX' per
        code block (fences kept), and the removed code bodies in order.
    """
    code_blocks = []

    def _swap_for_placeholder(match):
        # Remember the code and keep the fences around the placeholder.
        code_blocks.append(match.group(2))
        return match.group(1) + 'XXX' + match.group(3)

    text = _CODE_FENCE_RE.sub(_swap_for_placeholder, text)
    return text, code_blocks

def _translate(text):
    """Translate ``text`` with the module-level ``translator``, leaving code untouched.

    Fenced code blocks are swapped for 'XXX' placeholders before translation
    and put back afterwards.

    Args:
        text: The text to translate.

    Returns:
        The translated text with the code blocks restored, or ``None`` when
        the translator returned ``None``. Blank and digit-only text is
        returned unchanged without calling the translator.
    """
    # extract code blocks
    text, code_blocks = separate_text_and_code(text)
    # do not translate numbers, and do not send blank payloads to the service
    if not text.strip() or text.isdigit():
        translation = text
    else:
        # The language pair is fixed when the translator is constructed.
        translation = translator.translate(text)
    # Callers treat None as "translation failed"; pass it on rather than
    # crashing on None.replace below.
    if translation is None:
        return None
    # Place the code blocks back, one placeholder per block, scanning left to
    # right so an 'XXX' inside already restored code is never matched again.
    pieces = []
    remainder = translation
    for code in code_blocks:
        before, placeholder, remainder = remainder.partition('XXX')
        pieces.append(before)
        if placeholder:
            pieces.append(code)
    pieces.append(remainder)
    return ''.join(pieces)


def translate(text, max_chars=5000):
    """Translate ``text``, working around the translator's request-size limit.

    Text shorter than ``max_chars`` is translated in one request. Longer text
    has its fenced code blocks removed first; the remaining prose is then
    translated in one request if it now fits, otherwise line by line.

    Args:
        text: The text to translate.
        max_chars: Length at or above which the text is no longer sent as a
            single request (defaults to the previously hard-coded 5000).

    Returns:
        The translated text with code blocks restored, or ``None`` when a
        single-request translation returned ``None``.
    """
    if len(text) < max_chars:
        return _translate(text)

    # Pull the code out before splitting: the fence pattern spans several
    # lines, so per-line calls could never recognise a code block and the
    # code itself would be machine-translated.
    prose, code_blocks = separate_text_and_code(text)

    if len(prose) < max_chars:
        # Without the code the prose fits in one request, which keeps context.
        translation = _translate(prose)
        if translation is None:
            return None
    else:
        translated_lines = []
        for line in prose.split('\n'):
            stripped = line.strip()
            if not stripped or stripped == 'XXX':
                # Blank lines and placeholders carry nothing to translate.
                translated_lines.append(line)
                continue
            # NOTE: a single line of max_chars or more is still sent as is.
            translated = _translate(line)
            if translated is not None:
                translated_lines.append(translated)
        translation = '\n'.join(translated_lines)

    # Put the code blocks back, one placeholder per block, left to right.
    pieces = []
    remainder = translation
    for code in code_blocks:
        before, placeholder, remainder = remainder.partition('XXX')
        pieces.append(before)
        if placeholder:
            pieces.append(code)
    pieces.append(remainder)
    return ''.join(pieces)

In [123]:
def translate_message(message, translate):
    """Translate a message and, recursively, all of its replies in place.

    Args:
        message: Dict with a 'text' entry and an optional 'replies' list of
            message dicts of the same shape.
        translate: Callable mapping a text to its translation; ``None`` or an
            empty string is treated as a failed translation.

    Returns:
        The same ``message`` object, with a 'translation' entry added to it
        and to every message below it. When a translation fails, a notice is
        printed and the original text is stored instead.
    """
    text = message['text']
    translation = translate(text)
    if translation is None or len(translation) == 0:
        print('Translation failed for:', text)
        # Fall back to the source text, but keep going: returning here would
        # leave every reply below this message without a 'translation' entry.
        translation = text
    message['translation'] = translation
    for reply in message.get('replies') or []:
        # Replies are updated in place, so the return value is not needed.
        translate_message(reply, translate)
    return message
    

In [124]:
def translate_tree(tree, translate):
    """Translate the whole conversation hanging off ``tree['prompt']``.

    Args:
        tree: Dict whose 'prompt' entry is the root message of a conversation.
        translate: Callable passed through to ``translate_message``.

    Returns:
        The same ``tree`` object with its 'prompt' entry replaced by the
        result of ``translate_message``.
    """
    root_message = tree['prompt']
    translated_root = translate_message(root_message, translate)
    tree['prompt'] = translated_root
    return tree

In [125]:
def translate_dataset(data, translate, translations_path='data/translated'):
    """Translate every English tree in ``data`` and save each to its own JSON file.

    Trees whose prompt language is not 'en' and trees that already have an
    output file are skipped, so an interrupted run can simply be restarted.

    Args:
        data: Iterable of tree dicts with 'message_tree_id' and 'prompt' entries.
        translate: Callable mapping a text to its translation.
        translations_path: Directory for the '<message_tree_id>.json' files;
            created if it does not exist.

    Returns:
        List of the ids of trees whose translation or saving raised an exception.
    """
    # Without the directory every open() below would fail and each tree would
    # be reported as failed.
    os.makedirs(translations_path, exist_ok=True)
    failed_ids = []
    for tree in data:
        if tree['prompt']['lang'] != 'en':
            continue
        tree_id = tree['message_tree_id']
        tree_path = os.path.join(translations_path, tree_id + '.json')
        if os.path.exists(tree_path):
            continue
        # Write to a temporary name first: a crash in the middle of the dump
        # must not leave a truncated file that the exists() check above would
        # treat as finished on the next run.
        tmp_path = tree_path + '.tmp'
        # catch errors at this stage so as not to stop translation of other trees
        try:
            translated = translate_tree(tree, translate)
            # save to file
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(translated, f, ensure_ascii=False)
            os.replace(tmp_path, tree_path)
        except Exception as e:
            print(e)
            print('Translation failed for tree: ' + tree_id)
            failed_ids.append(tree_id)
    return failed_ids

In [126]:
# Translate the first 5182 trees; trees that already have an output file are skipped.
# NOTE(review): 5182 is an unexplained cut-off — confirm why the run stops there.
failed_ids = translate_dataset(data[:5182], translate, translations_path='../data/google_translate')

Translation failed for: ???


In [127]:
# Ids of the trees whose translation raised an exception (empty list = none).
failed_ids

[]

## Check the translated conversation

In [128]:
def generate_conversation(tree):
    """Follow the first reply at every level and collect the translated turns.

    Args:
        tree: Translated tree dict whose 'prompt' entry is the root message.

    Returns:
        List of ``(role, text)`` tuples from the prompt down the chain of
        first replies. ``text`` is the message's 'translation', or its
        original 'text' when no translation was stored.
    """
    conversation = []
    message = tree['prompt']
    while message is not None:
        # Messages below a failed translation may lack 'translation'; show the
        # source text instead of raising KeyError.
        if 'translation' in message:
            line = message['translation']
        else:
            line = message['text']
        conversation.append((message['role'], line))
        replies = message.get('replies') or []
        message = replies[0] if replies else None
    return conversation

In [129]:
# Pick one tree to spot-check.
# NOTE(review): the file only exists if tree 21 was translated (non-English trees are skipped) — confirm.
tree_id = data[21]['message_tree_id']

# The translations were written as UTF-8 with ensure_ascii=False, so read
# them back as UTF-8 rather than with the platform's default encoding.
with open('../data/google_translate/' + tree_id + '.json', 'r', encoding='utf-8') as f:
    translated = json.load(f)

In [130]:
# Print the conversation (prompt, then the first reply at each level) as "ROLE: text" lines.
conv = generate_conversation(translated)
for role, line in conv:
    print(f'{role.upper()}: {line}')

PROMPTER: Kako naj ustvarim skript za mešalnik, ki zamenja vse predmete z določenim imenom s točkovno svetlobo?
ASSISTANT: Tukaj je primer skripta Python za Blender, ki zamenja vse predmete z določenim imenom (v tem primeru "Cube") s točkovno svetlobo:

```
python

import bpy

# Set the name of the object you want to replace
object_name = "Cube"

# Create a new point light
light_data = bpy.data.lights.new(name="Point Light", type="POINT")
light_object = bpy.data.objects.new(name="Point Light", object_data=light_data)
bpy.context.scene.collection.objects.link(light_object)

# Find all objects with the specified name and replace them with the point light
for obj in bpy.context.scene.objects:
    if obj.name.startswith(object_name):
        obj_index = obj.pass_index # store the original index of the object for later use
        obj_data = obj.data
        obj_matrix = obj.matrix_world
        bpy.data.objects.remove(obj)
        light_object.pass_index = obj_index # assign the original i