In [None]:
import numpy as np
import os
import pandas as pd
import sacremoses
from tqdm.notebook import tqdm
import transformers
from transformers import AutoTokenizer
import torch
import glob
from Levenshtein import distance as levenshtein_distance

In [None]:
# Check whether PyTorch can see a CUDA GPU (the result is shown as the cell output).
torch.cuda.is_available()

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so that
# moving the models to `device` does not fail on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the mul-en model (used below to produce English translations) and its
# tokenizer from the local checkpoint directory.
mul_en_checkpoint_path = "savedmodels/mul-en"
mul_en_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    mul_en_checkpoint_path)
# Put the model in evaluation mode and place it on the selected device.
mul_en_model = mul_en_model.eval().to(device)
mul_en_tokenizer = AutoTokenizer.from_pretrained(
    mul_en_checkpoint_path)

In [None]:
# Load the en-mul model and its tokenizer from the local checkpoint directory.
en_mul_checkpoint_path = "savedmodels/en-mul"
en_mul_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    en_mul_checkpoint_path)
# Put *this* model in evaluation mode and move it to the device. (This line
# previously rebound en_mul_model to mul_en_model, silently discarding the
# en-mul weights that had just been loaded.)
en_mul_model = en_mul_model.eval().to(device)
en_mul_tokenizer = AutoTokenizer.from_pretrained(
    en_mul_checkpoint_path)

## Translate Acholi, Luganda and Ateso text for training en-mul model

First, read in the single-language text.

In [None]:
def file_to_list(path):
    '''Return the lines of the text file at `path`, each with trailing whitespace removed.'''
    with open(path) as handle:
        return [raw_line.rstrip() for raw_line in handle.readlines()]
    
# Language codes; each one has its own directory under back_translation/.
codes = ['ach', 'lug', 'teo']
# Maps language code -> list of sentences read from that language's files.
text = {}
for code in codes:
    text[code] = []
    # glob returns paths in arbitrary, OS-dependent order; sort them so the
    # resulting sentence order is the same on every run.
    files = sorted(glob.glob(f'back_translation/{code}/*.txt'))
    for f in files:
        # De-duplicate within each file while keeping first-seen order (a set
        # would reorder the lines differently on every run), and drop blank
        # lines, which carry no text to translate.
        unique_lines = dict.fromkeys(
            line for line in file_to_list(f) if line)
        text[code].extend(unique_lines)

Use the mul-en model to get English translations.

In [None]:
# Punctuation normalizer applied to every sentence before it is tokenized.
normalizer = sacremoses.MosesPunctNormalizer()

def batch_translate(text, batch_size = 20):
    '''Translate a list of sentences with the mul-en model.

    Args:
        text: List of sentences to translate.
        batch_size: Number of sentences passed to the model at once.

    Returns:
        List of decoded translations, in the same order as `text`.
    '''
    # Split the text up into batches that will fit on the GPU
    chunk_starts = range(0, len(text), batch_size)
    chunks = [text[start:start + batch_size] for start in chunk_starts]

    translations = []
    for chunk in tqdm(chunks):
        normalized = [normalizer.normalize(sentence) for sentence in chunk]

        # Pad to the longest sentence in the chunk; truncate at 128 tokens.
        encoded = mul_en_tokenizer(
            normalized, return_tensors="pt",
            padding=True, truncation=True,
            max_length=128)
        encoded = encoded.to(device)

        generated = mul_en_model.generate(**encoded)
        for sequence in generated:
            translations.append(mul_en_tokenizer.decode(
                sequence.squeeze(), skip_special_tokens=True))
    return translations

# Parallel lists: `source` holds the tagged translations produced by
# batch_translate, `target` holds the original sentences they came from.
source = []
target = []

for code in codes:
    originals = text[code]
    translations = batch_translate(originals)
    # Each source line starts with the tag of the language of its target.
    prefix = f'>>{code}<< '
    source += [prefix + translated for translated in translations]
    target += originals

If, for any example, the source is suspiciously similar to the target, then the target may actually be English rather than a local language. Filter these examples out.

In [None]:
def sentence_format(input, first_word_title_case = True):
    '''Ensure capital letter at the start and full stop at the end.

    Args:
        input: The sentence to format.
        first_word_title_case: Accepted for backward compatibility; it is
            currently ignored and the first character is always capitalised.

    Returns:
        The sentence with its first character capitalised and, unless it
        already ends in '.', '!' or '?', a '.' appended. An empty input is
        returned unchanged.
    '''
    # Guard against empty input: indexing input[0] would raise IndexError.
    if not input:
        return input
    input = input[0].capitalize() + input[1:]
    if not input.endswith(('.', '!', '?')):
        input = input + '.'
    return input

# A target is flagged as possibly English when it is longer than
# MIN_LENGTH_FOR_ENGLISH_CHECK characters and its edit distance to the
# translation, relative to its own length, is below MAX_RELATIVE_DISTANCE.
MIN_LENGTH_FOR_ENGLISH_CHECK = 30
MAX_RELATIVE_DISTANCE = 0.4

filtered_source = []
filtered_target = []

for s, t in zip(source, target):
    if not t:
        # An empty target carries no training signal; skip the pair.
        continue
    # Compare only the translated text: the leading '>>code<< ' language tag
    # is not part of the translation and would inflate the edit distance.
    _, separator, untagged = s.partition('<< ')
    translation = untagged if (separator and s.startswith('>>')) else s
    d = levenshtein_distance(translation, t)
    might_be_english = (
        len(t) > MIN_LENGTH_FOR_ENGLISH_CHECK
        and (d / len(t)) < MAX_RELATIVE_DISTANCE)
    if not might_be_english:
        filtered_source.append(s)
        filtered_target.append(sentence_format(t))

Create the back-translation training data files.

In [None]:
# Write the filtered pairs as two parallel files, one sentence per line.
# write() is used because the joined text is a single string; writelines()
# would iterate over it character by character.
with open("v7-dataset/v7.0/supervised/en-mul/back_translated.src", "w") as f:
    f.write('\n'.join(filtered_source))
with open("v7-dataset/v7.0/supervised/en-mul/back_translated.tgt", "w") as f:
    f.write('\n'.join(filtered_target))

## Translate English text for training mul-en model

In [None]:
eng_text = file_to_list('back_translation/eng/daily-monitor.txt')
# De-duplicate while keeping first-seen order (a set would order the sentences
# differently on every run) and drop blank lines, which have nothing to
# translate.
eng_text = list(dict.fromkeys(line for line in eng_text if line))

For each English sentence, choose one of the five other languages randomly to translate to.

In [None]:
# NOTE(review): `language_codes` was used below without being defined anywhere
# in this notebook, which raises NameError on a fresh kernel. The notebook
# text says one of five languages is chosen per sentence; the five codes here
# are an assumption — TODO confirm they match the target-language tags the
# en-mul model was trained with.
language_codes = ['ach', 'lgg', 'lug', 'nyn', 'teo']

# Seeded generator so that, for a given ordering of eng_text, the language
# assigned to each sentence is the same on every run.
rng = np.random.default_rng(0)

# Parallel lists: `source` holds the model's translations of the English
# sentences, `target` holds the formatted English sentences themselves.
source = []
target = []

# Split the text up into batches that will fit on the GPU
batch_size = 20
batches = [
    eng_text[i:i + batch_size]
    for i in range(0, len(eng_text), batch_size)
]

for batch in tqdm(batches):
    batch = [sentence_format(normalizer.normalize(t))
             for t in batch]

    # Randomly select language codes
    target_codes = rng.choice(
        language_codes, len(batch), replace=True)
    # Prefix each sentence with the tag of the language to translate into.
    source_text = [f'>>{code}<< {t}'
                   for t, code in zip(batch, target_codes)]

    inputs = en_mul_tokenizer(
        source_text, return_tensors="pt",
        padding=True, truncation=True,
        max_length=128).to(device)
    tokens = en_mul_model.generate(**inputs)
    translations = [en_mul_tokenizer.decode(
        t.squeeze(), skip_special_tokens=True)
              for t in tokens]

    source.extend(translations)
    target.extend(batch)

In [None]:
# Write the pairs as two parallel files, one sentence per line.
# write() is used because the joined text is a single string; writelines()
# would iterate over it character by character.
with open("v7-dataset/v7.0/supervised/mul-en/back_translated.src", "w") as f:
    f.write('\n'.join(source))
with open("v7-dataset/v7.0/supervised/mul-en/back_translated.tgt", "w") as f:
    f.write('\n'.join(target))