In [1]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoConfig, GenerationConfig
from transformers.integrations import TensorBoardCallback
import evaluate
import numpy as np
metric = evaluate.load("bleu")
source_lang = "dyu_Latn"
target_lang = "fra_Latn"
checkpoint = "facebook/nllb-200-distilled-600M"
zindi_ds = load_dataset("uvci/Koumankan_mt_dyu_fr")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#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
#huggingface-cli login

In [3]:
import re
import sys
import unicodedata
from sacremoses import MosesPunctNormalizer

mpn = MosesPunctNormalizer(lang="fr")
mpn.substitutions = [
    (re.compile(r), sub) for r, sub in mpn.substitutions
]

def get_non_printing_char_replacer(replace_by: str = " "):
    non_printable_map = {
        ord(c): replace_by
        for c in (chr(i) for i in range(sys.maxunicode + 1))
        # same as \p{C} in perl
        # see https://www.unicode.org/reports/tr44/#General_Category_Values
        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    }

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char

replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    clean = mpn.normalize(text)
    clean = replace_nonprint(clean)
    # replace 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 by Francesca
    clean = unicodedata.normalize("NFKC", clean)
    return clean

def preprocess_function(examples):
    inputs = [preproc(example["dyu"]) for example in examples["translation"]]
    targets = [preproc(example["fr"]) for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=50, truncation=True, padding="max_length")
    # Check for None values in input_ids and labels
    if None in model_inputs["input_ids"] or None in model_inputs["labels"]:
        print("Warning: None values found in tokenized output")
        # Remove examples with None values
        valid_indices = [i for i, (inp, lab) in enumerate(zip(model_inputs["input_ids"], model_inputs["labels"]))
                         if inp is not None and lab is not None]
        for key in model_inputs.keys():
            model_inputs[key] = [model_inputs[key][i] for i in valid_indices]
    return model_inputs

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    print(result)
    result = {"bleu": result["bleu"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [4]:
# source_lang = "dyu_Latn"
# target_lang = "fra_Latn"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, src_lang=source_lang, tgt_lang=target_lang)
# Apply preprocessing to the dataset
tokenized_zds = zindi_ds.map(
    preprocess_function,
    batched=True,
    remove_columns=zindi_ds["train"].column_names  # Remove original columns
)

Map: 100%|██████████| 8065/8065 [00:02<00:00, 2795.91 examples/s]
Map: 100%|██████████| 1471/1471 [00:00<00:00, 3480.60 examples/s]
Map: 100%|██████████| 1393/1393 [00:00<00:00, 3696.32 examples/s]


In [5]:
concat_ds = concatenate_datasets([tokenized_zds['train'], tokenized_zds['test']])

In [6]:
# ### Max len of 50 is enough
# def length_excluding_terminating_ones(list_of_lists):
#     lengths = []
#     for lst in list_of_lists:
#         # Reverse the list and find the first occurrence of a number not equal to 1
#         index = next((i for i, x in enumerate(reversed(lst)) if x != 1), len(lst))
#         # Calculate the length excluding the trailing 1s
#         lengths.append(len(lst) - index)
#     return lengths
# max(length_excluding_terminating_ones(tokenized_zds['validation']['input_ids']))

In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [8]:
# M2M100Config {
#   "_name_or_path": "facebook/nllb-200-distilled-600M",
#   "activation_dropout": 0.0,
#   "activation_function": "relu",
#   "architectures": [
#     "M2M100ForConditionalGeneration"
#   ],
#   "attention_dropout": 0.1,
#   "bos_token_id": 0,
#   "d_model": 1024,
#   "decoder_attention_heads": 16,
#   "decoder_ffn_dim": 4096,
#   "decoder_layerdrop": 0,
#   "decoder_layers": 12,
#   "decoder_start_token_id": 2,
#   "dropout": 0.1,
#   "encoder_attention_heads": 16,
#   "encoder_ffn_dim": 4096,
#   "encoder_layerdrop": 0,
#   "encoder_layers": 12,
#   "eos_token_id": 2,
#   "init_std": 0.02,
#   "is_encoder_decoder": true,
#   "max_length": 200,
#   "max_position_embeddings": 1024,
#   "model_type": "m2m_100",
#   "num_hidden_layers": 12,
#   "pad_token_id": 1,
#   "scale_embedding": true,
#   "tokenizer_class": "NllbTokenizer",
#   "torch_dtype": "float32",
#   "transformers_version": "4.45.0.dev0",
#   "use_cache": true,
#   "vocab_size": 256206
# }



In [9]:
!rm -rf models/nllb/nllb_output/base_model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
import gc
# GenerationConfig {
#   "bos_token_id": 0,
#   "decoder_start_token_id": 2,
#   "eos_token_id": 2,
#   "max_length": 200,
#   "pad_token_id": 1
# }
# Create a GenerationConfig object

#Load model and config
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
config = AutoConfig.from_pretrained(checkpoint)

#Save model,config and tokenizer
model.save_pretrained('models/nllb/nllb_output/base_model')
tokenizer.save_pretrained('models/nllb/nllb_output/base_model')

#Update config
config.dropout=0.5
config.max_length=50
config.save_pretrained('models/nllb/nllb_output/base_model')

#Update generation config
generation_config = GenerationConfig(
  bos_token_id= 0,
  decoder_start_token_id= 2,
  eos_token_id= 2,
  max_length= 50,
  pad_token_id= 1
)
generation_config.save_pretrained('models/nllb/nllb_output/base_model')
del model
gc.collect()

#Reload model
model = AutoModelForSeq2SeqLM.from_pretrained('models/nllb/nllb_output/base_model')

Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 50}


In [11]:
# import gc, torch
# gc.collect()
# torch.cuda.empty_cache()
# torch.cuda.synchronize()

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [14]:

training_args = Seq2SeqTrainingArguments(
    output_dir="models/nllb/nllb_output",
    eval_strategy="steps",
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # weight_decay=0.01,
    num_train_epochs=200000,
    predict_with_generate=True,
    fp16=True,
    # push_to_hub=False,
    do_train=True,
    do_eval=True,
    gradient_accumulation_steps=10,
    logging_dir= "models/nllb/nllb_output/logs",
    logging_steps = 1,
    save_strategy = 'steps',
    save_steps = 100,
    save_total_limit = 3,
    seed = 42,
    dataloader_drop_last = False,
    eval_steps = 1,
    # label_smoothing_factor: float = 0.0,
    optim = 'adafactor',
    # resume_from_checkpoint: Optional[str] = None,
    # fp16_backend: str = 'auto',
    # batch_eval_metrics: bool = False,
    # eval_on_start=True,
    # generation_max_length= 50,
    generation_num_beams=2,
    generation_config = "models/nllb/nllb_output/base_model/generation_config.json",
    run_name="Test2"
    
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=concat_ds,
    eval_dataset=tokenized_zds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 3/189000000 [10:31<11050310:16:31, 210.48s/it]
                                                             
[A                                                          

  0%|          | 1/189000000 [01:28<4630492:35:34, 88.20s/it]
[A

{'loss': 10.1952, 'grad_norm': 6.939687728881836, 'learning_rate': 9.999999947089948e-05, 'epoch': 0.0}




[A[A

[A[A

[A[A

[A[A

In [10]:
trainer.save_model("/root/zindi/models/nllb/nllb_output/checkpoint-300")

Non-default generation parameters: {'max_length': 100}
