In [1]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
# BLEU scorer from the `evaluate` library; compute_metrics calls `metric.compute(...)`.
metric = evaluate.load("bleu")
# Keys of the per-example `translation` dict that hold the source and target sentences.
source_lang = "dyu"
target_lang = "fr"
# Name of the pretrained checkpoint that the tokenizer and model are loaded from.
checkpoint = "facebook/nllb-200-distilled-600M"
# Parallel corpus; later cells read its "train", "validation" and "test" splits.
zindi_ds = load_dataset("uvci/Koumankan_mt_dyu_fr")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
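# SECURITY: an access token used to be pasted here. Never commit credentials to a notebook;
# revoke that token and authenticate with the interactive login below or the HF_TOKEN
# environment variable instead.
# !huggingface-cli login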

In [3]:
import re
import sys
import unicodedata
from sacremoses import MosesPunctNormalizer

# Moses punctuation normalizer. Each (pattern, replacement) rule it ships with is rebuilt
# with the pattern pre-compiled via `re.compile`.
mpn = MosesPunctNormalizer(lang="en")
mpn.substitutions = [
    (re.compile(pattern), replacement)
    for pattern, replacement in mpn.substitutions
]

def get_non_printing_char_replacer(replace_by: str = " "):
    """Build a function that replaces every non-printing character in a string.

    A character is treated as non-printing when its Unicode general category is one of
    the "Other" categories (the same class as ``\\p{C}`` in Perl; see
    https://www.unicode.org/reports/tr44/#General_Category_Values).

    Args:
        replace_by: Text substituted for each non-printing character.

    Returns:
        A callable taking a string and returning it with the substitutions applied.
    """
    other_categories = {"C", "Cc", "Cf", "Cs", "Co", "Cn"}

    # The translation table is built once, over every code point, when the factory is called.
    translation_table = {}
    for code_point in range(sys.maxunicode + 1):
        if unicodedata.category(chr(code_point)) in other_categories:
            translation_table[code_point] = replace_by

    def replace_non_printing_char(line) -> str:
        return line.translate(translation_table)

    return replace_non_printing_char


# Module-level replacer that turns each non-printing character into a single space.
replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    """Normalise one sentence before tokenization.

    Applies, in order: Moses punctuation normalisation (`mpn`), replacement of
    non-printing characters (`replace_nonprint`), and Unicode NFKC normalisation, which
    folds compatibility characters (e.g. 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 becomes Francesca).

    Args:
        text: Raw sentence.

    Returns:
        The cleaned sentence.
    """
    return unicodedata.normalize("NFKC", replace_nonprint(mpn.normalize(text)))

def preprocess_function(examples):
    """Clean and tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch whose ``"translation"`` entry is a list of dicts keyed by
            ``source_lang`` and ``target_lang``.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask`` and ``labels``),
        truncated to 128 tokens and left unpadded.

    Notes:
        - Sequences are not padded here. Padding to ``max_length`` would fill ``labels``
          with the pad token id, and those positions would then count towards the loss.
          Padding is left to the data collator at batch time.
        - Pairs with a missing side are dropped before normalisation, so the output can
          hold fewer rows than the input batch. This relies on the caller removing the
          original columns in ``Dataset.map``.
    """
    pairs = [
        (example[source_lang], example[target_lang])
        for example in examples["translation"]
    ]
    # Filter before `preproc`: a missing text would make normalisation fail, and the
    # tokenizer output itself never contains None entries.
    usable_pairs = [
        (source, target)
        for source, target in pairs
        if source is not None and target is not None
    ]
    skipped = len(pairs) - len(usable_pairs)
    if skipped:
        print(f"Warning: skipped {skipped} example(s) with a missing source or target text")

    inputs = [preproc(source) for source, _ in usable_pairs]
    targets = [preproc(target) for _, target in usable_pairs]
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

def postprocess_text(preds, labels):
    """Trim decoded texts and shape them for the metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A tuple ``(preds, labels)`` where every string has surrounding whitespace
        removed and each label is wrapped in its own one-element list.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may be a
            tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` and ``"gen_len"`` (mean number of non-pad prediction
        tokens), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a real token id, so it cannot be decoded.
    # It can appear in predictions as well as labels, so both arrays are mapped to pad.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    print(result)
    result = {"bleu": result["bleu"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {name: round(value, 4) for name, value in result.items()}

In [4]:
# NLLB-200 identifies languages by FLORES-200 codes ("<language>_<script>"), which differ
# from the dataset's column keys ("dyu" / "fr"). A code the tokenizer does not know
# resolves to the unknown-token id instead of a language tag, so the real codes are used.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, src_lang="dyu_Latn", tgt_lang="fra_Latn")

# Apply preprocessing to every split in batches. The original columns are removed so
# that only the tokenizer outputs remain.
tokenized_zds = zindi_ds.map(
    preprocess_function,
    batched=True,
    remove_columns=zindi_ds["train"].column_names,
)



In [5]:
# Merge the tokenized "train" and "test" splits into a single dataset.
# NOTE(review): this merged dataset is what the trainer is given as training data, so the
# test split is trained on — confirm this is intended.
concat_ds = concatenate_datasets([tokenized_zds['train'], tokenized_zds['test']])

In [6]:
# Display the merged dataset's summary (feature names and row count).
concat_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9458
})

In [7]:
# Load the model first: the collator's `model` argument expects the model object, not the
# checkpoint name. Given only a string, it cannot use the model to prepare
# `decoder_input_ids` from the labels.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [8]:
import gc

import torch

# Collect unreachable Python objects first so that any GPU tensors they hold are freed.
gc.collect()
# The CUDA calls are guarded so this cell also runs on a machine without a GPU, where
# `torch.cuda.synchronize()` would otherwise raise.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

In [9]:

# Fine-tuning configuration.
training_args = Seq2SeqTrainingArguments(
    # Output locations and reproducibility.
    output_dir="models/nllb/nllb_output",
    logging_dir="models/nllb/nllb_output/logs",
    seed=42,
    do_train=True,
    do_eval=True,
    # Optimisation. 20 sequences x 10 accumulation steps = 200 sequences per device per
    # optimizer step.
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    gradient_accumulation_steps=10,
    dataloader_drop_last=False,
    fp16=True,
    # NOTE(review): 200000 epochs looks like a "run until interrupted" placeholder —
    # confirm the intended training length.
    num_train_epochs=200000,
    # Evaluation runs once per epoch. The former `eval_steps=1` was dropped because it
    # only applies to step-based evaluation and contradicted this setting.
    eval_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=2,
    # Logging and checkpointing are step-based.
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=20,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=concat_ds,
    eval_dataset=tokenized_zds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
