#### We can use our best trained models to generate high-quality sentences to add to the SALT training corpus, improving the performance and quality of our models.

Back-translation is the translation of monolingual text in the target language into the source language, usually with a pretrained translation model. The resulting synthetic sentence pairs are then added to the model's training data and used to fine-tune the model. Forward translation instead uses monolingual data in the source language; it is usually less effective, since the same model is used both to translate the data and to retrain on it.

In this case, we use our pretrained English-to-many and many-to-English models.

In [1]:
import json
import os

import datasets
import pandas as pd
import sacremoses
import torch
import transformers

from tqdm import tqdm

In [24]:
# Find the necessary Google language codes here: https://cloud.google.com/translate/docs/languages
# All settings for this back-translation run live in this one dictionary.
config = dict(
    bt_src_langs=["ach", "lgg", "lug", "nyn", "teo"],
    bt_tgt_langs=["eng"],
    model_name="BT_14_3_m2en",
    model_checkpoint="/content/drive/MyDrive/john_models/saved_models/marianmt-many-eng/marianmt-many-eng-checkpoint-700/",
    max_input_length=128,
    # Directory holding one sub-directory of text files per language.
    monolingual_data_dir="parallel-text-EDA/back_translation/data",
    # Directory the back-translated JSONL files are written to.
    output_data_dir="/content/drive/MyDrive/BT/MarianMT_M2E_14_3_23",
)


In [5]:
# Read every monolingual text file for the configured source languages,
# keeping one stripped, non-empty sentence per input line.
sentences = {language: [] for language in config["bt_src_langs"]}

# sorted() makes the sentence order reproducible; os.listdir order is arbitrary.
for bt_language in sorted(os.listdir(config["monolingual_data_dir"])):
    if bt_language not in config["bt_src_langs"]:
        continue
    language_dir = os.path.join(config["monolingual_data_dir"], bt_language)
    for file_to_read in sorted(os.listdir(language_dir)):
        # Skip hidden files such as editor or OS metadata.
        if file_to_read.startswith("."):
            continue
        print(file_to_read)
        with open(os.path.join(language_dir, file_to_read), encoding="utf-8") as lfd:
            # Drop the trailing newline readlines() would keep, and skip blank
            # lines so they are neither translated nor written to the output.
            stripped_lines = (line.strip() for line in lfd)
            sentences[bt_language].extend(line for line in stripped_lines if line)

# Only Luganda and Acholi are wrapped as datasets here; languages with no
# directory on disk simply keep an empty sentence list.
lug_data_raw = datasets.Dataset.from_dict({"lug": sentences["lug"]})
ach_data_raw = datasets.Dataset.from_dict({"ach": sentences["ach"]})

bukedde.txt
makerere.txt
misc.txt
acholi-online.txt
rupiny.txt


In [6]:
# Show the Luganda dataset summary (feature names and row count).
lug_data_raw

Dataset({
    features: ['lug'],
    num_rows: 12304
})

In [7]:
# Load the tokenizer and seq2seq model from the configured checkpoint, then
# build the collator and BLEU metric objects on top of them.
checkpoint_path = config["model_checkpoint"]
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_path)
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
metric = datasets.load_metric("sacrebleu")

  metric = datasets.load_metric('sacrebleu')


In [10]:
# Back-translate the Luganda sentences to English, one batch at a time.
batch_size = 128
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Move the model once up front instead of on every loop iteration.
model.to(device)

lug_results = []
for text_idx in tqdm(range(0, len(lug_data_raw), batch_size)):
    # A slice that runs past the end just yields the shorter final batch, so
    # no IndexError fallback is needed.
    batch = lug_data_raw[text_idx:text_idx + batch_size]["lug"]
    # Truncate to the configured limit so over-long lines are cut to
    # config["max_input_length"] tokens rather than encoded at full length.
    encoded = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=config["max_input_length"],
        return_tensors="pt",
    ).to(device)

    generated_tokens = model.generate(**encoded, max_length=128)
    lug_results.extend(
        tokenizer.batch_decode(generated_tokens.cpu(), skip_special_tokens=True)
    )




100%|██████████| 97/97 [13:53<00:00,  8.59s/it]


In [27]:
# Write the Luganda sentences and their English back-translations as JSON
# Lines: one {"text": {"lug": ..., "eng": ...}} object per line.
language_dir = os.path.join(config["output_data_dir"], "lug")
# makedirs also creates a missing output_data_dir and tolerates an existing
# directory, which os.mkdir behind an exists() check did not.
os.makedirs(language_dir, exist_ok=True)

# Every source sentence must have exactly one translation before pairing.
assert len(lug_results) == len(lug_data_raw), (
    "number of translations does not match number of source sentences"
)

filepath = os.path.join(language_dir, "bt_lug_en.jsonl")
with open(filepath, "w", encoding="utf-8") as outfile:
    # Read the source column once instead of indexing the dataset per row.
    for source_text, translation in zip(lug_data_raw["lug"], lug_results):
        dict_to_dump = {
            "text": {
                "lug": source_text,
                "eng": translation,
            }
        }
        json.dump(dict_to_dump, outfile)
        outfile.write('\n')




In [11]:
# Back-translate the Acholi sentences to English, one batch at a time.
batch_size = 128
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Move the model once up front instead of on every loop iteration.
model.to(device)

ach_results = []
for text_idx in tqdm(range(0, len(ach_data_raw), batch_size)):
    # A slice that runs past the end just yields the shorter final batch, so
    # no IndexError fallback is needed.
    batch = ach_data_raw[text_idx:text_idx + batch_size]["ach"]
    # Truncate to the configured limit so over-long lines are cut to
    # config["max_input_length"] tokens rather than encoded at full length.
    encoded = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=config["max_input_length"],
        return_tensors="pt",
    ).to(device)

    generated_tokens = model.generate(**encoded, max_length=128)
    ach_results.extend(
        tokenizer.batch_decode(generated_tokens.cpu(), skip_special_tokens=True)
    )


100%|██████████| 52/52 [07:03<00:00,  8.14s/it]


In [28]:
# Write the Acholi sentences and their English back-translations as JSON
# Lines: one {"text": {"ach": ..., "eng": ...}} object per line.
language_dir = os.path.join(config["output_data_dir"], "ach")
# makedirs also creates a missing output_data_dir and tolerates an existing
# directory, which os.mkdir behind an exists() check did not.
os.makedirs(language_dir, exist_ok=True)

# Every source sentence must have exactly one translation before pairing.
assert len(ach_results) == len(ach_data_raw), (
    "number of translations does not match number of source sentences"
)

filepath = os.path.join(language_dir, "bt_ach_en.jsonl")
with open(filepath, "w", encoding="utf-8") as outfile:
    # Read the source column once instead of indexing the dataset per row.
    for source_text, translation in zip(ach_data_raw["ach"], ach_results):
        dict_to_dump = {
            "text": {
                "ach": source_text,
                "eng": translation,
            }
        }
        json.dump(dict_to_dump, outfile)
        outfile.write('\n')


