## Imports and setup

In [24]:
# Install the third-party packages that the import cell below relies on.
# The installer output is cleared afterwards so it does not clutter the notebook.
from IPython import display
!pip install transformers
!pip install sacrebleu
!pip install sacremoses
!pip install datasets
!pip install wandb
!pip install sentencepiece
display.clear_output()

In [3]:
import datasets
from IPython import display
import numpy as np
import os
import pandas as pd
import random
import sentencepiece
import sacrebleu
import sacremoses
import tqdm
import transformers
import torch
import wandb

In [13]:
torch.cuda.is_available()

True

## Configuration

Alternative pre-trained checkpoints to start from when translating to English: `Helsinki-NLP/opus-mt-lg-en`, `Helsinki-NLP/opus-mt-mul-en`.

Note 1: when training on V100 GPUs, there is more memory, so `train_batch_size` can be increased (possibly to 64). If this is done, `gradient_accumulation_steps` should be decreased accordingly so that the effective batch size stays the same.

Note 2: there is little difference in BLEU score when using a test set of 500 vs 1000 sentences per language. For rapid parameter tuning, we can therefore use `config['validation_samples_per_language'] = 500`, and then set it to 1000 for the best model config to report numbers in the paper.

In [14]:
# Base parameters. The learning rate and gradient accumulation values in this
# dict are the mul-en settings; they are overridden just below whenever the
# target language is 'mul' (as it is here, since this dict selects en -> mul).
# NOTE: the 'model_checkpoint' entry is recomputed further down from
# 'language_pair', so the value written here is only a placeholder.
config = {
    'source_language': 'en',
    'target_language': 'mul',
    'metric': 'sacrebleu',
    'model_checkpoint': 'Helsinki-NLP/opus-mt-en-mul',
    'train_batch_size': 16,
    'gradient_accumulation_steps': 16,
    'eval_batch_size': 16,
    'max_input_length': 128,
    'max_target_length': 128,
    'input_prefix': '',
    'validation_samples_per_language': 500,
    'eval_languages': ["ach", "lgg", "lug", "nyn", "teo"],
    'learning_rate': 1e-4,
    'num_train_epochs': 20,
    'label_smoothing_factor': 0.1,
}

# Adjustments for en-mul models
if config['target_language'] == 'mul':
    config['learning_rate'] = 5e-4
    config['gradient_accumulation_steps'] = 64


# Derived settings: the language pair string names the W&B project, the
# pre-trained checkpoint and (below) the output directory and run name.
config['language_pair'] = f'{config["source_language"]}-{config["target_language"]}'
config['wandb_project'] = f'sunbird-translate-{config["language_pair"]}'
config['model_checkpoint'] = f'Helsinki-NLP/opus-mt-{config["language_pair"]}'

# Evaluate roughly every 10 minutes
# The numerator is divided by the effective batch size (accumulation steps x
# per-device batch size) to convert it into optimizer steps.
# NOTE(review): 220 * 60 * 7 is presumably an estimate of the number of
# samples processed in the target interval -- TODO confirm its derivation.
eval_steps_interval = 220 * 60 * 7 / (config['gradient_accumulation_steps']
                                      * config['train_batch_size'])

# Round down to a multiple of 100 steps, but never below 100.
eval_steps_interval = 100 * max(1, int(eval_steps_interval / 100))

print(f'Evaluating every {eval_steps_interval} training steps.')

# Checkpoints are saved on the same schedule as evaluation, and the best
# checkpoint (selected by 'loss') is reloaded when training ends.
# fp16 is enabled only when a CUDA device is available.
config['train_settings'] = transformers.Seq2SeqTrainingArguments(
    f'output-{config["language_pair"]}',
    evaluation_strategy = 'steps',
    eval_steps = eval_steps_interval,
    save_steps = eval_steps_interval,
    gradient_accumulation_steps = config['gradient_accumulation_steps'],
    learning_rate = config['learning_rate'],
    per_device_train_batch_size = config['train_batch_size'],
    per_device_eval_batch_size = config['eval_batch_size'],
    weight_decay = 0.01,
    save_total_limit = 3,
    num_train_epochs = config['num_train_epochs'],
    predict_with_generate = True,
    fp16 = torch.cuda.is_available(),
    logging_dir = f'output-{config["language_pair"]}',
    report_to = 'wandb',
    run_name = f'{config["source_language"]}-{config["target_language"]}',
    load_best_model_at_end=True,
    metric_for_best_model = 'loss',
    label_smoothing_factor = config['label_smoothing_factor']
)


Evaluating every 100 training steps.


# Set up datasets

Download the raw text data.

In [15]:
# Download and unpack the dataset archive only when the 'v7-dataset' directory
# is not already present, so re-running this cell skips the download.
if not os.path.exists('v7-dataset'):
    !wget https://sunbird-translate.s3.us-east-2.amazonaws.com/v7-dataset.zip
    !unzip v7-dataset.zip
    display.clear_output()

Create a training set by interleaving separate training subsets.

Notes:
* This includes MT560 which has many examples (484,925), but which is biased towards religious text so we sample from it sparsely.
* We just use a 2-way train/test split for this experiment, so the validation sentences are included in the training set.
* LGG, ACH and TEO are oversampled a little by duplicating the validation sets, as a simple way to correct for there being more LUG and NYN training data.

In [16]:
# Directory holding the parallel '<subset>.src' / '<subset>.tgt' text files
# for the configured language pair.
DATA_DIR = f'v7-dataset/v7.0/supervised/{config["language_pair"]}/'
# File stems of the subsets that make up the training set. 'val_ach',
# 'val_teo' and 'val_lgg' are listed twice on purpose, to oversample them.
TRAINING_SUBSET_IDS = ['train', 'train_mt560', 'train_ai4d', 'train_flores',
                       'val_ach', 'val_lgg', 'val_lug', 'val_nyn', 'val_teo',
                       'val_ach', 'val_teo', 'val_lgg']
# Relative sampling weight for each entry of TRAINING_SUBSET_IDS (same order);
# these are normalised into probabilities before the subsets are interleaved.
TRAINING_SUBSETS_SAMPLE_RATIO = [83770, 80000, 15021, 4018,
                                 4126, 4126, 4126, 4126, 4126,
                                 4126, 4126, 4126]

def _file_to_list(path):
    with open(path) as file:
        lines = file.readlines()
        lines = [line.rstrip() for line in lines]
        return lines
    
def dataset_from_src_tgt_files(data_dir, dataset_id, read_first_n = 0):
    '''Build a dataset of translation pairs from a pair of parallel text files.

    Reads `<data_dir>/<dataset_id>.src` and `<data_dir>/<dataset_id>.tgt` and
    pairs their lines up positionally.

    Args:
        data_dir: Directory containing the `.src` and `.tgt` files.
        dataset_id: File name stem shared by the two files.
        read_first_n: If non-zero, keep only this many leading lines from each
            file; the default of 0 keeps every line.

    Returns:
        A `datasets.Dataset` with a single `translation` column, where each
        row is a dict keyed by the configured source and target language.
    '''
    stem = os.path.join(data_dir, dataset_id)
    source_lines = _file_to_list(stem + '.src')
    target_lines = _file_to_list(stem + '.tgt')

    if read_first_n:
        source_lines = source_lines[:read_first_n]
        target_lines = target_lines[:read_first_n]

    records = []
    for source_text, target_text in zip(source_lines, target_lines):
        records.append({config['source_language']: source_text,
                        config['target_language']: target_text})

    return datasets.Dataset.from_dict({'translation': records})

In [17]:
# Fixed base seed so that shuffling and interleaving give the same training
# set on every Restart & Run All.
SHUFFLE_SEED = 42

# Every subset needs exactly one sampling weight.
assert len(TRAINING_SUBSET_IDS) == len(TRAINING_SUBSETS_SAMPLE_RATIO), \
    'TRAINING_SUBSET_IDS and TRAINING_SUBSETS_SAMPLE_RATIO must have the same length.'

training_subsets = [dataset_from_src_tgt_files(DATA_DIR, subset_id)
                    for subset_id in TRAINING_SUBSET_IDS]
# Use a different seed per subset so that subsets listed twice (for
# oversampling) are not shuffled into the identical order.
training_subsets = [subset.shuffle(seed=SHUFFLE_SEED + offset)
                    for offset, subset in enumerate(training_subsets)]
# Normalise the relative weights into sampling probabilities.
sample_probabilities = np.array(TRAINING_SUBSETS_SAMPLE_RATIO) / np.sum(TRAINING_SUBSETS_SAMPLE_RATIO)
train_data_raw = datasets.interleave_datasets(
    training_subsets, probabilities=sample_probabilities, seed=SHUFFLE_SEED)

Make the separate validation sets

In [18]:
# One validation subset per evaluation language, in the order given by
# config['eval_languages'], each capped at the configured number of samples.
validation_subsets = [dataset_from_src_tgt_files(
    DATA_DIR, f'test_{lang}', read_first_n = config['validation_samples_per_language'])
    for lang in config['eval_languages']]

# compute_metrics() splits the concatenated predictions into consecutive
# blocks of exactly `validation_samples_per_language` rows per language, so a
# short subset would silently attribute sentences to the wrong language.
for lang, subset in zip(config['eval_languages'], validation_subsets):
    if len(subset) != config['validation_samples_per_language']:
        raise ValueError(
            f'Validation subset for {lang} has {len(subset)} examples, expected '
            f'{config["validation_samples_per_language"]}.')

validation_data_raw = datasets.concatenate_datasets(validation_subsets)

## Helper functions

Note that whatever pre-processing we do here (punctuation normalisation and ensuring sentence case), we should also do at test-time when running the model on real queries.

In [11]:
def sentence_format(input):
    '''Ensure capital letter at the start and full stop at the end.

    Args:
        input: The sentence to format.

    Returns:
        The sentence with its first character capitalised and, unless it
        already ends in '.', '!' or '?', a trailing '.' appended. An empty
        string is returned unchanged.
    '''
    # Blank lines occur in real corpora; indexing input[0] would raise on them.
    if not input:
        return input
    input = input[0].capitalize() + input[1:]
    if not input.endswith(('.', '!', '?')):
        input = input + '.'
    return input

def preprocess(examples):
    '''Normalise and tokenise a batch of translation pairs.

    Source and target sentences are punctuation-normalised and put into
    sentence case, then tokenised with truncation at the configured maximum
    lengths. The target token ids are attached under the "labels" key.

    Args:
        examples: A batch whose 'translation' entry is a list of dicts keyed
            by the configured source and target language.

    Returns:
        The tokenizer output for the source sentences, with an added "labels"
        entry holding the token ids of the target sentences.
    '''
    normalizer = sacremoses.MosesPunctNormalizer()

    def _clean(text):
        # Same cleaning for both sides: punctuation normalisation, then sentence case.
        return sentence_format(normalizer.normalize(text))

    source_texts = [_clean(pair[config['source_language']])
                    for pair in examples['translation']]
    target_texts = [_clean(pair[config['target_language']])
                    for pair in examples['translation']]

    model_inputs = tokenizer(
        source_texts, max_length=config['max_input_length'], truncation=True)

    # The targets are tokenised in the tokenizer's target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            target_texts, max_length=config['max_target_length'], truncation=True)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

def postprocess(preds, labels):
    '''Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A tuple `(preds, labels)` where `preds` is a list of stripped strings
        and `labels` is a list of single-element lists, each holding one
        stripped reference.
    '''
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each reference is wrapped in its own list.
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds, eval_languages, samples_per_language):
    '''Compute a per-language score and the mean score for evaluation output.

    The predictions are assumed to be ordered as consecutive blocks of
    `samples_per_language` rows, one block per entry of `eval_languages`.

    Args:
        eval_preds: A `(predictions, labels)` pair of token id arrays; if the
            predictions are a tuple, its first element is used.
        eval_languages: Language codes, in the order of the blocks.
        samples_per_language: Number of rows in each language block.

    Returns:
        A dict with one `BLEU_<lang>` entry per language plus `BLEU_mean`,
        every value rounded to 4 decimal places.
    '''
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100 cannot be decoded, so swap it for the pad token id first.
    decodable_labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    label_texts = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

    pred_texts, label_texts = postprocess(pred_texts, label_texts)

    scores = {}
    for index, lang in enumerate(eval_languages):
        block = slice(index * samples_per_language, (index + 1) * samples_per_language)
        block_result = metric.compute(
            predictions=pred_texts[block],
            references=label_texts[block])
        scores[f"BLEU_{lang}"] = block_result["score"]

    scores["BLEU_mean"] = np.mean([scores[f"BLEU_{lang}"] for lang in eval_languages])

    return {name: round(value, 4) for name, value in scores.items()}

# Training

Instantiate the model and tokenizer.

In [20]:
# Load the pre-trained model and its tokenizer from the configured checkpoint.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(config['model_checkpoint'])
tokenizer = transformers.AutoTokenizer.from_pretrained(config['model_checkpoint'])
# Collator used by the trainer to batch the tokenised examples.
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer, model = model) 
# Evaluation metric named in the config ('sacrebleu'); read by compute_metrics().
metric = datasets.load_metric(config['metric'])

For multiple language outputs, we need to make sure the language codes have some mapping in the encoder. We can re-use the token indices of some other language codes in the pre-trained model that we don't need.

In `Helsinki-NLP/opus-mt-en-mul`, only Luganda (`lug`) is already supported.

In [21]:
if config['target_language'] == 'mul':
    # Maps each language code we need onto an existing language code of the
    # pre-trained model whose token index is taken over.
    replacement_codes = {'nyn': 'kin',
                         'lgg': 'lin',
                         'ach': 'tso',
                         'teo': 'som',
                         'luo': 'sna'}
    for new_code, donor_code in replacement_codes.items():
        new_token = f'>>{new_code}<<'
        donor_token = f'>>{donor_code}<<'
        # Skip codes that are already mapped, or whose donor token is absent.
        if new_token in tokenizer.encoder or donor_token not in tokenizer.encoder:
            continue
        # Move the donor's token index over to the new language token.
        tokenizer.encoder[new_token] = tokenizer.encoder.pop(donor_token)

    # Check that all the evaluation language codes are mapped to something.
    for code in config['eval_languages']:
        if f'>>{code}<<' not in tokenizer.encoder:
            raise ValueError(f'Language code {code} not found in the encoder.')

Pre-process the raw text datasets.

In [25]:
# Tokenise both datasets in batches with preprocess(); the raw "translation"
# column is dropped, leaving only the columns that preprocess() returns.
train_data  = train_data_raw.map(
    preprocess, remove_columns=["translation"], batched=True)

validation_data  = validation_data_raw.map(
    preprocess, remove_columns=["translation"], batched=True)

  0%|          | 0/212 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Launch the training.

In [26]:
# Start the Weights & Biases run, recording the whole config dict with it.
wandb.init(project=config['wandb_project'], config=config)


def _eval_metrics(eval_preds):
    '''Adapt compute_metrics() to the single-argument callback the trainer uses.'''
    return compute_metrics(
        eval_preds,
        config['eval_languages'],
        config['validation_samples_per_language'])


# Early stopping is configured with a patience of 3.
early_stopping = transformers.EarlyStoppingCallback(early_stopping_patience = 3)

trainer = transformers.Seq2SeqTrainer(
    model = model,
    args = config['train_settings'],
    train_dataset = train_data,
    eval_dataset = validation_data,
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = _eval_metrics,
    callbacks = [early_stopping],
)

[34m[1mwandb[0m: Currently logged in as: [33mjqug[0m (use `wandb login --relogin` to force relogin)


Using amp fp16 backend


In [27]:
trainer.train()

***** Running training *****
  Num examples = 211923
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 64
  Total optimization steps = 4120
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Bleu Ach,Bleu Lgg,Bleu Lug,Bleu Nyn,Bleu Teo,Bleu Mean
100,No log,3.309345,8.5939,5.1237,17.2365,8.0258,6.3979,9.0755
200,No log,3.019797,11.0189,10.405,18.4485,10.3602,11.4533,12.3372
300,No log,2.909963,12.5116,13.3112,19.76,11.3438,13.5159,14.0885
400,No log,2.842729,14.1259,14.6068,20.6028,11.4466,15.1027,15.1769
500,3.194600,2.801257,14.2744,14.8799,22.8441,12.0802,15.3984,15.8954
600,3.194600,2.767067,14.7736,16.0145,22.1762,12.53,16.4693,16.3927
700,3.194600,2.749725,15.8938,16.4915,22.8989,12.2963,17.3813,16.9923
800,3.194600,2.71672,16.0911,17.2798,23.3419,12.6728,16.9516,17.2674
900,3.194600,2.713619,16.3476,17.2645,23.4512,13.297,17.5567,17.5834
1000,2.622800,2.702202,15.7649,17.6676,24.0574,13.1839,17.6657,17.6679


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 16
Saving model checkpoint to output-en-mul/checkpoint-100
Configuration saved in output-en-mul/checkpoint-100/config.json
Model weights saved in output-en-mul/checkpoint-100/pytorch_model.bin
tokenizer config file saved in output-en-mul/checkpoint-100/tokenizer_config.json
Special tokens file saved in output-en-mul/checkpoint-100/special_tokens_map.json
Deleting older checkpoint [output-en-mul/checkpoint-1600] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 16
Saving model checkpoint to output-en-mul/checkpoint-200
Configuration saved in output-en-mul/checkpoint-200/config.json
Model weights saved in output-en-mul/checkpoint-200/pytorch_model.bin
tokenizer config file saved in output-en-mul/checkpoint-200/tokenizer_config.json
Special tokens file saved in output-en-mul/checkpoint-200/special_tokens_map.json
Deleting older checkpoint [output-en-mul/checkpoint-1800] due t

TrainOutput(global_step=1700, training_loss=2.703928653492647, metrics={'train_runtime': 11309.4871, 'train_samples_per_second': 374.77, 'train_steps_per_second': 0.364, 'total_flos': 4.546194863652864e+16, 'train_loss': 2.703928653492647, 'epoch': 8.25})

## Test translation

To use a different model version, checkpoints can be loaded as follows. 

Saved models are available in [this Drive folder](https://drive.google.com/drive/folders/1XJBKEPwwwwaSOoFdT4SL4EcFN6iPNPcg?usp=sharing), named according to the random `wandb.ai` title.

In [24]:
# Directory of a previously saved model checkpoint to load for inference.
checkpoint_path = "savedmodels/en-mul-ethereal-valley"
# Both loaders are reached through the `transformers` module imported at the
# top of the notebook, matching how the model and tokenizer were first
# created, so no extra mid-notebook import is needed.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_path)

Example of using an `en-mul` model. Note that we have to do the same pre-processing as was used to train the model.

In [25]:
def translate(source_text, model, tokenizer, device='cpu'):
    '''Translate a single piece of text with the given model and tokenizer.

    Args:
        source_text: The text to translate, including any target-language
            prefix token the model expects.
        model: Sequence-to-sequence model providing `eval`, `to` and `generate`.
        tokenizer: Tokenizer matching the model.
        device: Device to run inference on, as a string or `torch.device`.
            Defaults to 'cpu'.

    Returns:
        The decoded translation as a string, with special tokens removed.
    '''
    device = torch.device(device)
    model = model.eval().to(device)
    # Truncate so an over-long query cannot exceed the model's maximum length.
    inputs = tokenizer(source_text, return_tensors="pt", truncation=True).to(device)
    tokens = model.generate(**inputs)
    # A single input gives a batch of one; index it explicitly rather than
    # squeezing, which would also collapse a length-1 sequence dimension.
    return tokenizer.decode(tokens[0], skip_special_tokens=True)

def translate_to_multiple(query,
                          languages = {'ach': 'Acholi', 'lgg': 'Lugbara', 'lug': 'Luganda','nyn': 'Runyankore', 'teo': 'Ateso'}):
    '''Translate an English query into several target languages.

    The query gets the same pre-processing as the training data (punctuation
    normalisation and sentence formatting) before being translated once per
    language with the notebook-level `model` and `tokenizer`.

    Args:
        query: The text to translate.
        languages: Mapping from language code (used in the `>>code<<` prefix
            token) to the display name shown in the result. Only the codes
            present in this mapping are translated.

    Returns:
        A pandas DataFrame indexed by language display name ('target'), with
        a single 'translation' column.
    '''
    normalizer = sacremoses.MosesPunctNormalizer()
    normalized_query = sentence_format(normalizer.normalize(query))

    # Keep the established display order for the known codes, then append any
    # additional codes the caller supplied, so custom mappings are honoured.
    preferred_order = ['lug', 'ach', 'nyn', 'teo', 'lgg']
    ordered_codes = [code for code in preferred_order if code in languages]
    ordered_codes += [code for code in languages if code not in preferred_order]

    translations = []
    for lang in ordered_codes:
        translation = translate(f">>{lang}<< {normalized_query}", model, tokenizer)
        translations.append({'target': languages[lang], 'translation': translation})

    # Explicit columns keep the frame well-formed even when nothing was translated.
    df = pd.DataFrame(translations, columns=['target', 'translation'])
    return df.set_index('target')

In [32]:
query = "how many people are there in Kampala?"

translate_to_multiple(query)

Unnamed: 0_level_0,translation
target,Unnamed: 1_level_1
Luganda,Mu Kampala mulimu abantu bameka?
Acholi,Dano adi matye I Kampala?
Runyankore,Omuri Kampala harimu abantu bangahi?
Ateso,Itunga bo idi ejaas Kampala?
Lugbara,Ba Kampalaa 'diyi ma kalafe si?
