In [1]:
%%capture
# Install the third-party dependencies for this notebook. The `%%capture`
# magic on the first line hides the (long) installer output of this cell.
!pip install peft
!pip install -U transformers
!pip install -U datasets
!pip install bitsandbytes
!pip install accelerate
!pip install sentencepiece
!pip install sacremoses
!pip install wandb

# Later this will just be 'pip install leb'
# Until then, clone the `new-dataset-functions` branch of the leb repository.
!git clone -b new-dataset-functions https://github.com/jqug/leb.git
# The package requirements include transformers, datasets, etc
# NOTE(review): this runs after the `-U` upgrades above, so it may change the
# versions installed by them -- confirm against leb/requirements.txt.
!pip install -r leb/requirements.txt

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import transformers
import datasets
import evaluate
import tqdm
import leb.dataset
import leb.utils
import leb.metrics
import yaml
import peft
import wandb
from IPython import display

In [None]:
# Print the `nvidia-smi` status table, but only when torch can see a CUDA device.
if torch.cuda.is_available():
  !nvidia-smi

Thu Feb 29 10:00:12 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              43W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# define the artifacts directory for output files
drive_folder = "./artifacts"

# Create the directory in pure Python instead of a shell-backed magic:
# `exist_ok=True` makes re-running this cell a no-op, and missing parent
# directories are created as well.
os.makedirs(drive_folder, exist_ok=True)

# Number of examples targeted per accumulated step, and the number of
# examples in each per-device batch (both are fed into the training args).
effective_train_batch_size = 3000
train_batch_size = 25
eval_batch_size = train_batch_size

# Floor division keeps the result an exact int without a float round-trip;
# any remainder is dropped, as before.
gradient_accumulation_steps = effective_train_batch_size // train_batch_size

# Everything in one YAML string, so that it can all be logged to MLflow.
# The `{name}` fields are str.format placeholders that are filled in right
# after this literal; everything else is parsed as-is by the YAML loader.
# NOTE(review): the visible cells pass the parsed config to wandb.init rather
# than to MLflow -- confirm which tracker is actually intended.
yaml_config = '''
training_args:
  output_dir: "{drive_folder}"
  evaluation_strategy: steps
  eval_steps: 100
  save_steps: 100
  gradient_accumulation_steps: {gradient_accumulation_steps}
  learning_rate: 3.0e-4  # Include decimal point to parse as float
  optim: adafactor
  per_device_train_batch_size: {train_batch_size}
  per_device_eval_batch_size: {eval_batch_size}
  weight_decay: 0.01
  save_total_limit: 3
  max_steps: 1500
  predict_with_generate: True
  fp16: True
  logging_dir: "{drive_folder}"
  load_best_model_at_end: True
  metric_for_best_model: loss
  seed: 42
  hub_model_id: nllb-1.3B-many-to-many-pronouncorrection-caseaugmentation
  push_to_hub: True

mlflow_run_name: translate-many-eng
max_input_length: 128
eval_pretrained_model: False
early_stopping_patience: 4
data_dir: .
model_checkpoint: jq/nllb-1.3b-many-to-many-step800

datasets:
  train:
    huggingface_load:
      - path: sunbird/salt
        name: text-all
        split: train
      - path: Sunbird/external_mt_datasets
        name: ai4d.parquet
        split: train
      - path: Sunbird/external_mt_datasets
        name: flores200.parquet
      - path: Sunbird/external_mt_datasets
        name: lafand-en-lug-combined.parquet
      - path: Sunbird/external_mt_datasets
        name: lafand-en-luo-combined.parquet
      - path: Sunbird/external_mt_datasets
        name: mozilla_110.parquet
      - path: Sunbird/external_mt_datasets
        name: mt560_ach.parquet
      - path: Sunbird/external_mt_datasets
        name: mt560_lug.parquet
      - path: Sunbird/external_mt_datasets
        name: mt560_nyn.parquet
      - path: Sunbird/external_mt_datasets
        name: bt_from-eng-google.parquet
      - path: Sunbird/external_mt_datasets
        name: bt_from-lug-google.parquet
      - path: Sunbird/external_mt_datasets
        name: bt_ach_en_14_3_23.parquet
      - path: Sunbird/external_mt_datasets
        name: bt_en_many_30_3.parquet
      - path: Sunbird/external_mt_datasets
        name: bt_lug_en_14_3_23.parquet
    source:
      type: text
      language: [ach,lgg,lug,nyn,teo,eng]
      preprocessing:
        - clean_text
        - match_target_sentence_format_to_source
        - random_case
        - augment_characters
        - prefix_dataset_tag:
            tags:
              mt560: '<mt560>'
              bt_: '<bt>'
    target:
      type: text
      language: [ach,lgg,lug,nyn,teo,eng]
      preprocessing:
        - clean_text
    shuffle: True
    src_or_tgt_languages_must_contain: eng  # Limit to xx->eng, eng->xx
    allow_same_src_and_tgt_language: False

  validation:
    huggingface_load:
      path: sunbird/salt
      name: text-all
      split: dev  # optionally use a slice, e.g. dev[:10] for a quick test
    source:
      type: text
      language: [ach,lgg,lug,nyn,teo,eng]
      preprocessing:
        - clean_text
        - match_target_sentence_format_to_source
    target:
      type: text
      language: [ach,lgg,lug,nyn,teo,eng]
      preprocessing:
        - clean_text
    src_or_tgt_languages_must_contain: eng  # Limit to xx->eng, eng->xx
    allow_same_src_and_tgt_language: False
'''

# Substitute the Python-side values into the `{...}` placeholders of the
# template, then parse the filled-in text into a plain Python structure.
yaml_config = yaml_config.format(**{
    'drive_folder': drive_folder,
    'train_batch_size': train_batch_size,
    'eval_batch_size': eval_batch_size,
    'gradient_accumulation_steps': gradient_accumulation_steps,
})
config = yaml.safe_load(yaml_config)

# Every key of the `training_args` section is forwarded as a keyword argument.
training_settings = transformers.Seq2SeqTrainingArguments(**config["training_args"])

In [None]:
# Model weights are loaded from the checkpoint named in the config.
model = leb.utils.TrainableM2MForConditionalGeneration.from_pretrained(
    config['model_checkpoint'],
)

# The tokenizer is loaded from the facebook/nllb-200-distilled-1.3B repo, with
# English as the initial source and target language.
tokenizer = transformers.NllbTokenizer.from_pretrained(
    'facebook/nllb-200-distilled-1.3B', src_lang='eng_Latn', tgt_lang='eng_Latn',
)

config.json:   0%|          | 0.00/864 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/483M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38
the `fairseq_tokens_to_ids` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Short language codes that are registered on the tokenizer below.
SALT_LANGUAGE_CODES = ["eng", "ach", "lgg", "lug", "nyn", "teo", "ibo", "swa"]

# Short code -> existing NLLB language token whose id is reused for it.
code_mapping = dict(
    # Exact/close mapping
    eng='eng_Latn',
    lug='lug_Latn',
    ach='luo_Latn',
    ibo='ibo_Latn',
    swa='swh_Latn',
    # Random mapping
    nyn='ace_Latn',
    teo='afr_Latn',
    lgg='aka_Latn',
)

# NOTE(review): `offset` is not referenced by any other visible cell -- confirm
# whether it is still needed.
offset = tokenizer.sp_model_size + tokenizer.fairseq_offset

# Register every short code under the id of its mapped NLLB token, and make
# that id map back to the short code.
for salt_code in SALT_LANGUAGE_CODES:
    nllb_token_id = tokenizer.lang_code_to_id[code_mapping[salt_code]]
    tokenizer.lang_code_to_id[salt_code] = nllb_token_id
    tokenizer.fairseq_ids_to_tokens[nllb_token_id] = salt_code

In [None]:
# Id used to pad the label sequences. -100 is the collator's documented
# default, described there as being ignored by PyTorch loss functions.
label_pad_token_id = -100

data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id)

In [None]:
def preprocess(examples):
    """Tokenize a batch and stamp language tokens onto every example.

    Args:
        examples: a batch with the columns 'source', 'target',
            'source.language' and 'target.language', each indexable by row.

    Returns:
        The tokenizer output for the batch, in which the first token of each
        `input_ids` row is replaced by the source-language id, the first token
        of each `labels` row by the target-language id, and to which a
        `forced_bos_token_id` list (one target-language id per row) is added.
    """
    model_inputs = tokenizer(
        examples['source'],
        text_target=examples['target'],
        max_length=config['max_input_length'],
        truncation=True)

    # For NLLB models, the language codes of source and target are written
    # into the first position of the encoder inputs and of the labels.
    forced_bos_token_ids = []
    for row in range(len(examples['source'])):
        source_token = tokenizer.lang_code_to_id[examples['source.language'][row]]
        target_token = tokenizer.lang_code_to_id[examples['target.language'][row]]
        model_inputs['input_ids'][row][0] = source_token
        model_inputs['labels'][row][0] = target_token
        forced_bos_token_ids.append(target_token)
    model_inputs['forced_bos_token_id'] = forced_bos_token_ids

    return model_inputs


# Build the training and validation datasets from their config sections.
train_dataset = leb.dataset.create(config['datasets']['train'])
eval_dataset = leb.dataset.create(config['datasets']['validation'])

# Display a few validation rows as a sanity check before tokenization.
leb.utils.show_dataset(eval_dataset, N=5)

# Tokenize the training set in batches and drop the raw text/language columns.
train_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=['source', 'source.language', 'target', 'target.language'])
# The validation set is tokenized the same way but keeps its original columns.
# NOTE(review): presumably the metric function below needs them -- confirm in
# leb.metrics.
eval_dataset = eval_dataset.map(
    preprocess,
    batched=True)

# Metric callback for the trainer, built from the validation set, the
# sacrebleu metric and the tokenizer.
compute_metrics = leb.metrics.multilingual_eval_fn(
      eval_dataset, [evaluate.load('sacrebleu')],
      tokenizer, log_first_N_predictions=10)


Unnamed: 0,source,target,source.language,target.language
0,Tye yub mapat pat me nyayo lim i cing lwak ma nongo i lok me pur,There are a number of wealth creation programs around agriculture,ach,eng
1,Otita Lonyi eti otizani ꞌdiyi agrikica ma alia tre.,There are a number of wealth creation programs around agriculture.,lgg,eng
2,Waliwo pulogulaamu nnyingi ez'okweggya mu bwavu ezeetooloorera ku byobulimi n'obulunzi.,There are a number of wealth creation programs around agriculture.,lug,eng
3,Hariho puroguraamu nyingi z'okureetaho obugaiga omu by'obuhingi n'oburiisa,There are a number of wealth creation programs around agriculture,nyn,eng
4,Ipu ainapeta nuka adumun abar kotoma akoru.,There are a number of wealth creation programs around agriculture.,teo,eng


In [None]:
# Start a Weights & Biases run and attach the parsed config to it.
# `wandb` is already imported in the notebook's import cell, so the duplicate
# import that used to live here is dropped.
wandb.init(project='sunbird-translate-multilingual', config=config)

In [None]:
# Replace the stock forced-BOS logits processor with leb's variant. This is
# done before the trainer is built.
# NOTE(review): presumably this lets generation use the per-example
# `forced_bos_token_id` values -- confirm in leb.utils.
transformers.generation.utils.ForcedBOSTokenLogitsProcessor = (
    leb.utils.ForcedVariableBOSTokenLogitsProcessor)

trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=training_settings,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        # Early stopping, with the patience value taken from the config.
        transformers.EarlyStoppingCallback(
            early_stopping_patience=config['early_stopping_patience']),
    ],
)

trainer.train()

In [None]:
# Upload the tokenizer to the Hub.
# NOTE(review): this repo name differs from the `hub_model_id` set in
# training_args -- confirm the tokenizer is meant to go to a separate repo.
tokenizer.push_to_hub('nllb-1.3B-many-to-many-step-2k')