#### We can use the Google Translate API to forward- and back-translate data, which amounts to an indirect distillation of (piggybacking on) Google's trained translation models.

Backtranslation is the translation of monolingual text in the target language into the source language, usually using a pretrained translation model. The resulting synthetic sentence pairs are then added to the model's training data and used to fine-tune the model. Forward translation instead uses monolingual data in the source language; it is usually less effective, since the same model is used both to translate the data and to be retrained on it.

In this case we use Google's translation API, so both directions are valid, since we are training on the translation results of a different model.

In [None]:
import json
import os
import pandas as pd

from google.cloud import translate_v2
from tqdm import tqdm


In [None]:
# Google language codes are listed at https://cloud.google.com/translate/docs/languages
config = dict(
    # Source language: the code handed to the Google client, and the ISO token
    # used in output file names and JSON keys.
    source_language_ggl_token="en",
    source_language_iso_token="eng",
    # Target language, same two kinds of code.
    target_language_ggl_token="lg",
    target_language_iso_token="lug",
    # Directory holding the monolingual input files / directory for generated output.
    monolingual_data_dir="back_translation/data",
    output_data_dir="back_translation/google_bt",
    # temp_data_dir="/temp/temp_bt_files",
)

In [None]:
# Read every visible file in the monolingual data directory into one flat
# list of sentences (one sentence per input line).
sentences = []
# os.listdir returns entries in arbitrary order; sorting keeps the sentence
# indices stable between runs so a run can be resumed via a start index.
files_to_code = sorted(os.listdir(config["monolingual_data_dir"]))
for file_to_read in files_to_code:
    path_to_read = os.path.join(config["monolingual_data_dir"], file_to_read)
    # Skip hidden files (e.g. .DS_Store) and sub-directories, which cannot be opened as text.
    if file_to_read.startswith(".") or not os.path.isfile(path_to_read):
        continue
    print(file_to_read)
    with open(path_to_read, encoding="utf-8") as lfd:
        # Strip line terminators/surrounding whitespace: they would otherwise be
        # counted in the cost estimate, sent to the API, and end up embedded in
        # the one-record-per-line output file.
        lines = [line.strip() for line in lfd]
    # Blank lines carry nothing to translate.
    sentences.extend(line for line in lines if line)

: 

Estimated cost per run (7/3/2023 pricing)

The first 500k characters are charged at a standard rate of $10.
From 500k to 1bn characters, the rate is $20 per 10^6 characters, using the Basic Translation API without source-language detection.

This is the pricing we will use to calculate.

In [None]:

# Rough cost estimate: total number of characters across *all* loaded sentences,
# priced at 20 dollars per million characters.
total_count = sum(len(sentence) for sentence in tqdm(sentences))

total_price = (total_count/1000000)*20
print(f"Estimated cost of this run is ${total_price}")


: 

In [None]:
# Expose the service-account JSON path through the GOOGLE_APPLICATION_CREDENTIALS
# environment variable.
# Alternatively, comment these lines out and set the environment variable
# directly from bash, docker, etc.
path_to_google_srv_account_json = "/enter/your/own/path/here"
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': path_to_google_srv_account_json})


: 

Create temp versions of the data to avoid corrupting the monolingual dataset in case of an error

In [None]:

# Google Cloud Translation v2 client; the translation loop further down sends its batches through it.
# NOTE(review): presumably authenticates via the GOOGLE_APPLICATION_CREDENTIALS variable set earlier — confirm.
translate_client = translate_v2.Client()

: 

In [None]:
## It is wise to start with a smaller subset to ensure everything works, before running the whole batch and finding out there is an error after credits have been spent

starting_idx = 0
# Clamp to the data actually loaded so no batch is requested past the end.
end_index = min(50, len(sentences))  # use len(sentences) for the full run
step_size = 10 #100

# Create the output directory up front: otherwise the first write fails only
# after a translation request has already been made (and billed).
os.makedirs(config["output_data_dir"], exist_ok=True)
combined_file = os.path.join(config["output_data_dir"], f"combined-{config['source_language_iso_token']}-{config['target_language_iso_token']}-data.txt" )

In [None]:
from tqdm import tqdm
from time import sleep
translated_scentences = []


def _get_string(line_dict):
    """Serialise one translation result as a single line of the combined file.

    Args:
        line_dict: one result dict from the translate client; its
            "translatedText" and "input" entries are read.

    Returns:
        "<translated text>__SEP__<original input>" with any embedded line
        breaks replaced by spaces, so that each record occupies exactly one line.
    """
    translated = " ".join(line_dict["translatedText"].splitlines()).strip()
    original = " ".join(line_dict["input"].splitlines()).strip()
    return "".join([translated, "__SEP__", original])


# Translate sentences[starting_idx:end_index] in batches of step_size and append
# every batch to combined_file as soon as it comes back, so an error part-way
# through does not lose results that have already been paid for.
for current_idx in tqdm(range(starting_idx, end_index,step_size )):
    print(f"Current IDX: {current_idx} END_IDX: {end_index} STEPSIZE: {step_size}")
    segment_to_translate = sentences[current_idx:min(current_idx + step_size, end_index)]
    if not segment_to_translate:
        # end_index lies beyond the loaded data; there is nothing left to send.
        print("no sentences left at: " + str(current_idx))
        break

    try:
        fresh_off_the_translator = translate_client.translate(
            segment_to_translate,
            source_language=config["source_language_ggl_token"],
            target_language=config["target_language_ggl_token"],
            # The v2 API treats text as HTML unless told otherwise and then
            # returns HTML-escaped output (e.g. "&#39;" for an apostrophe).
            format_="text",
            )
        # A single string input yields a single dict; normalise to a list so
        # list.extend() does not add the dict's keys instead of the result.
        if isinstance(fresh_off_the_translator, dict):
            fresh_off_the_translator = [fresh_off_the_translator]
        elif not isinstance(fresh_off_the_translator, list):
            raise ValueError("Unexpected Value")
        translated_scentences.extend(fresh_off_the_translator)
        with open(combined_file, "a", encoding="utf-8") as tfd:
            tfd.writelines(_get_string(line_dict) + "\n" for line_dict in fresh_off_the_translator)
        # Pause between requests only after the batch is safely on disk.
        sleep(5)
    except Exception as e:
        # Best effort: report where the run stopped so it can be resumed by
        # setting starting_idx to this value.
        print(e)
        print("stopped at: " + str(current_idx))
        break


In [None]:
# Convert the combined "__SEP__" file into a JSONL file of sentence pairs.
#
# Each row of combined_file is "<translated text>__SEP__<original input>":
# the FIRST column is in the target language and the SECOND in the source
# language, so the column names are given in that order.
words_df = pd.read_csv(
    combined_file,
    sep="__SEP__",
    engine="python",  # multi-character separators are only supported by the python engine
    names=[config["target_language_iso_token"], config["source_language_iso_token"]],
    dtype=str,
    keep_default_na=False,  # keep sentences such as "NA" or "null" as text instead of NaN
    encoding="utf-8",
).dropna()  # rows without a separator have no second field and cannot form a pair
output_file_name = f'backtranslated-from-{config["source_language_iso_token"]}-to-{config["target_language_iso_token"]}.jsonl'
path_to_output = os.path.join(config["output_data_dir"], output_file_name)

source_key = config["source_language_iso_token"]
target_key = config["target_language_iso_token"]

# combined_file already accumulates every batch, so the JSONL file is rebuilt
# from scratch ("w"); appending would duplicate all earlier records on re-run.
with open(path_to_output, "w", encoding="utf-8") as fd:
    for src_sentence, tgt_sentence in zip(words_df[source_key], words_df[target_key]):
        dict_to_dump = {
            "text": {
                source_key: src_sentence,
                target_key: tgt_sentence
            }
        }
        fd.write(json.dumps(dict_to_dump)+"\n")

: 