#### Install the required packages (modules are imported and the source task file is loaded in a later cell)

In [3]:
!pip install deepl
!pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


#### Choose the translator you would like to use

In [4]:
# Installs deepl through the %pip magic, which targets the running kernel's environment.
# NOTE(review): presumably needed because the earlier `!pip` call installed into a different
# Python environment - confirm and drop this cell if it is redundant.
%pip install deepl


Collecting deepl
  Using cached deepl-1.15.0-py3-none-any.whl (32 kB)
Installing collected packages: deepl
Successfully installed deepl-1.15.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.10 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Translation backend: "deepl" routes texts to translate_list_deepl, any other
# value (e.g. "openai") routes them to translate_list_openai.
TRANSLATOR = "deepl" # or openai

#### Authenticate to deepl

In [10]:
import getpass
import os

import deepl

TARGET_LANG = "ES"  # DeepL target language code, e.g. "DE", "ES", "FR"
FORMALITY = "less"  # formality value passed to translate_text; "less" asks for informal wording

# Never store the DeepL key in the notebook: read it from the DEEPL_AUTH_KEY
# environment variable and fall back to an interactive prompt, so the secret
# never ends up in the saved file.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked in the DeepL account.
auth_key = os.environ.get("DEEPL_AUTH_KEY") or getpass.getpass("DeepL auth key: ")
translator = deepl.Translator(auth_key)

#### Setup OpenAI information

In [9]:
import os

import openai

# Read the key from the OPENAI_API_KEY environment variable instead of pasting it
# into the notebook; the value stays empty when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
MODEL = "gpt-3.5-turbo"  # chat model passed to openai.ChatCompletion.create
TARGET_LANGUAGE = "Spanish"  # language name inserted into the prompt, e.g. "English", "German", "Spanish"

In [17]:
import json
import pandas as pd
import numpy as np
import re
import glob
import os

input_tasks_path = "data/source_tasks/tasks_translated_en.json"

# Parse the source task file and turn the decoded JSON into the working DataFrame.
with open(input_tasks_path, "rb") as f:
    json_data = json.load(f)
df = pd.DataFrame(json_data)
    
def write_json_file(blob, file_path):
    """Serialize `blob` as JSON into `file_path`, replacing any existing content."""
    with open(file_path, mode='w') as handle:
        json.dump(blob, handle)

### Start translating dataset

#### util functions that help avoid translating content that is not intended for translation

In [13]:
def matches_regex(regex, text):
    """Return True when the pattern `regex` matches anywhere in `text`, otherwise False."""
    return re.search(regex, text) is not None


def contains_code(text):
    """Heuristically decide whether `text` contains source code.

    The text counts as code when it contains one of a few literal markers
    or matches one of the patterns listed below.

    Args:
        text: the string to inspect.

    Returns:
        True when a code marker or pattern is found, otherwise False.
    """
    # filter based on keywords that indicate code
    code_blacklist = ('&&', '||', '<html>', ';\n', 'SELECT')
    if any(code_keyword in text for code_keyword in code_blacklist):
        return True

    code_patterns = (
        r'\w+\(\w*\) \{',           # e.g. myFunc() {
        r'def \w+\(',               # e.g. def parse_list(
        # Attribute access. The character class must not start with an escaped
        # bracket, otherwise it only matches a literal "[" and never an identifier.
        # NOTE: this is deliberately broad and also fires on dotted abbreviations
        # such as "e.g".
        r'[A-Za-z_]+\.[A-Za-z_]+',  # e.g. this.language
        r': [\w\.#]{1,12};',        # e.g. font-size: 1.3em;
        r'<\/\w+>',                 # e.g. </html>
    )
    return any(re.search(pattern, text) for pattern in code_patterns)


def contains_words(text):
    """Return True when `text` contains a run of at least three ASCII letters.

    The class is spelled `[A-Za-z]` on purpose: the range `[A-z]` also covers
    the punctuation between "Z" and "a" (`[`, `\\`, `]`, `^`, `_` and the
    backtick), which would make strings such as "___" count as words.
    """
    return re.search(r'[A-Za-z]{3,}', text) is not None  # words with at least three characters


def is_translatable(text):
    """Tell whether `text` should be sent to the translator.

    Empty strings are accepted; any other text must contain words and must
    not look like code.
    """
    if text == "":
        # empty string won't be charged by DeepL
        return True
    if contains_code(text):
        return False
    return contains_words(text)

#### util functions to translate individual columns (instruction, input and output) of each chunk as a list

In [14]:
def translate_and_update_series(text_series):
    """Translate the translatable entries of a pandas Series and return a list.

    Entries rejected by `is_translatable` are sent to the translator as empty
    strings and are put back untranslated in the result. The translator is
    chosen by `TRANSLATOR`: "deepl" uses `translate_list_deepl`, anything else
    uses `translate_list_openai`.

    Args:
        text_series: pandas Series of strings. It is not modified.

    Returns:
        A list with one entry per element of `text_series`, in the same order.
    """
    source_texts = text_series.tolist()

    # memorize whether and where the list contains non-translatable content
    skip_flags = [not is_translatable(text) for text in source_texts]

    # Build a separate list for the translator instead of blanking entries in
    # `text_series`: the series is usually a slice of the caller's DataFrame and
    # writing into it would modify (or warn about modifying) the caller's data.
    texts_to_translate = ["" if skip else text for text, skip in zip(source_texts, skip_flags)]

    # translate list
    if TRANSLATOR == "deepl":
        translated_list = list(translate_list_deepl(texts_to_translate))
    else:
        translated_list = list(translate_list_openai(texts_to_translate))

    # restore the original text wherever the content was not translatable
    for position, skip in enumerate(skip_flags):
        if skip:
            translated_list[position] = source_texts[position]
    return translated_list

def create_openai_prompt_string(text):
    """Build the translation request sent to the chat model for `text`.

    Text containing a space is introduced as sentences, text without a space
    as a single word; the target language comes from `TARGET_LANGUAGE`.
    """
    return (
        f'Please provide the {TARGET_LANGUAGE} translation for these sentences: {text}'
        if ' ' in text
        else f'Please provide the {TARGET_LANGUAGE} translation for the following word: {text}'
    )

def create_openai_message_list(text_list):
    """Turn each text into a user chat message; empty strings become None placeholders."""
    messages = []
    for text in text_list:
        if text == "":
            messages.append(None)
        else:
            messages.append({"role": "user", "content": create_openai_prompt_string(text)})
    return messages

def translate_openai_message(message, max_retries=5, initial_delay=1.0):
    """Send one chat message to the OpenAI chat completion API and return the reply text.

    Failed requests are retried with exponential backoff. After the last
    attempt the error is re-raised instead of retrying forever, so a permanent
    problem (for example a missing API key) surfaces instead of hanging the
    notebook.

    Args:
        message: chat message dict, or None for an entry that must not be translated.
        max_retries: number of retries after the first failed attempt.
        initial_delay: seconds to wait before the first retry; doubled after each failure.

    Returns:
        The stripped content of the first choice, or "" when `message` is None.

    Raises:
        Exception: whatever the last failed request raised.
    """
    import time  # local import so this cell does not depend on another cell's imports

    if message is None:
        return ""

    total_attempts = max(1, max_retries + 1)
    delay = initial_delay
    for attempt in range(total_attempts):
        try:
            response = openai.ChatCompletion.create(
                model=MODEL,
                messages=[message]
            )
        except Exception:
            # `Exception` (not a bare except) keeps KeyboardInterrupt working.
            if attempt == total_attempts - 1:
                raise
            time.sleep(delay)
            delay *= 2
        else:
            return response["choices"][0]["message"]["content"].strip()

def translate_list_openai(text_list):
    """Translate each text through the OpenAI helpers, one request per entry, keeping the order."""
    translations = []
    for message in create_openai_message_list(text_list):
        translations.append(translate_openai_message(message))
    return translations

def translate_list_deepl(text_list):
    """Translate `text_list` from English with one call to the DeepL translator.

    The target language and formality come from `TARGET_LANG` and `FORMALITY`.
    Returns the `text` attribute of every result as a list.
    """
    # here would be the place to replace the DeepL library with the Google library for example
    results = translator.translate_text(
        text_list,
        source_lang="EN",
        target_lang=TARGET_LANG,
        formality=FORMALITY,
    )
    translated_texts = []
    for result in results:
        translated_texts.append(result.text)
    return translated_texts

#### Divide dataframe into chunks and translate the chunks sequentially

I'm sure this part can be heavily improved (feel free to create a pull request)

In [15]:
# Feel free to increase the chunk size. I was worried that the execution would be interrupted,
# so I used a smaller chunk size
chunk_size = 5
output_dir = './data/output/'

def translate_dataframe(df, start_index=0):
    """Translate `df` chunk by chunk and write every chunk to its own JSON file.

    The frame is split into `df.shape[0] // chunk_size` parts (at least one).
    For each part the `instruction`, `input` and `output` columns are
    translated and the records are written to `{output_dir}chunk{N}.json`,
    where N is the position of the chunk in the split.

    Args:
        df: DataFrame with `instruction`, `input` and `output` columns.
        start_index: position of the first chunk to translate. The default 0
            translates everything; pass a higher value to resume an
            interrupted run without re-translating earlier chunks.
    """
    os.makedirs(output_dir, exist_ok=True)
    if df.shape[0] == 0:
        return  # nothing to translate

    # At least one section: np.array_split raises for 0 sections, which happened
    # for frames with fewer rows than chunk_size.
    number_of_chunks = max(1, df.shape[0] // chunk_size)
    chunked_df_list = np.array_split(df, number_of_chunks)

    for chunk_number, chunk_df in enumerate(chunked_df_list[start_index:], start=start_index):
        # Pass copies so the source DataFrame is never modified by the translation step.
        instruction_list_translated = translate_and_update_series(chunk_df["instruction"].copy())
        input_list_translated = translate_and_update_series(chunk_df["input"].copy())
        output_list_translated = translate_and_update_series(chunk_df["output"].copy())

        translated_df = pd.DataFrame({'instruction': instruction_list_translated, 'input': input_list_translated, 'output': output_list_translated})
        translated_dict = translated_df.to_dict('records')

        write_json_file(translated_dict, f'{output_dir}chunk{chunk_number}.json')

#### Start translating the DataFrame (Warning: Run this cell carefully)

In [19]:
# Translates df chunk by chunk and writes the chunk files to output_dir.
# Every chunk is sent to the configured translation service, so this consumes API quota.
translate_dataframe(df)

QuotaExceededException: Quota for this billing period has been exceeded, message: Quota Exceeded

#### Finally combine all chunked files into one translated task file

In [22]:
def combine_chunks():
    """Merge all chunk files in `output_dir` into one translated task file.

    Every file named `chunk<N>.json` is read in ascending order of N and its
    records are appended to one list, which is written to
    `./translated_tasks_es_{TRANSLATOR}.json`. Other JSON files in the
    directory are ignored, and the numbering may start at any index and may
    contain gaps.
    """
    indexed_paths = []
    for path in glob.glob(f'{output_dir}chunk*.json'):
        name_match = re.fullmatch(r'chunk(\d+)\.json', os.path.basename(path))
        if name_match:
            indexed_paths.append((int(name_match.group(1)), path))

    translated_tasks_list = []
    # Sort by the numeric index so chunk10 comes after chunk9, not after chunk1.
    for _, path in sorted(indexed_paths):
        with open(path, "rb") as f:
            translated_tasks_list += json.load(f)
    write_json_file(translated_tasks_list, f'./translated_tasks_es_{TRANSLATOR}.json')

combine_chunks()