In [1]:
# %cd /home/rana/Projects/zindi
%cd /root/zindi/
import yaml
import json
import os   
import shutil
import evaluate
import numpy as np
metric = evaluate.load("bleu")
from transformers import AutoModelForSeq2SeqLM
with open('common/config.yaml', 'r') as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)

def delete_file_if_exists(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"File {file_path} has been deleted.")
    else:
        print(f"File {file_path} does not exist.")


def copy_file_or_directory(source, destination):
    try:
        if os.path.isfile(source):
            # If source is a file, copy it directly
            shutil.copy2(source, destination)
            print(f"File copied successfully from {source} to {destination}")
        elif os.path.isdir(source):
            # If source is a directory, copy all files within it
            if not os.path.exists(destination):
                os.makedirs(destination)
            for item in os.listdir(source):
                s = os.path.join(source, item)
                d = os.path.join(destination, item)
                if os.path.isfile(s):
                    shutil.copy2(s, d)
                    print(f"File copied successfully from {s} to {d}")
        else:
            print(f"Source {source} is neither a file nor a directory.")
    except FileNotFoundError:
        print(f"Source not found: {source}")
    except PermissionError:
        print("Permission denied. Check file permissions.")
    except shutil.SameFileError:
        print("Source and destination are the same file.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

def create_or_clean_directory(path):
    if os.path.exists(path):
        # Path exists, so clean it
        for item in os.listdir(path):
            item_path = os.path.join(path, item)
            if os.path.isfile(item_path):
                os.unlink(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        print(f"Cleaned existing directory: {path}")
    else:
        # Path doesn't exist, so create it
        os.makedirs(path)
        print(f"Created new directory: {path}")


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


/root/zindi


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def safe_bleu_compute(bleu, predictions, references):
    try:
        return bleu.compute(predictions=predictions, references=references)
    except ZeroDivisionError:
        return {"bleu": 0.0, "precisions": [0.0, 0.0, 0.0, 0.0], "brevity_penalty": 0.0, "length_ratio": 0.0, "translation_length": 0, "reference_length": 0}

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    # print(decoded_preds, decoded_labels)
    result = safe_bleu_compute(metric, predictions=decoded_preds, references=decoded_labels)
    # result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    # print(result)
    result = {"bleu": result["bleu"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

def preprocess_function(examples):
    inputs = [example["dyu"] for example in examples["translation"]]
    targets = [example["fr"] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=48, truncation=True, padding="max_length")
    # Check for None values in input_ids and labels
    if None in model_inputs["input_ids"] or None in model_inputs["labels"]:
        print("Warning: None values found in tokenized output")
        # Remove examples with None values
        valid_indices = [i for i, (inp, lab) in enumerate(zip(model_inputs["input_ids"], model_inputs["labels"]))
                         if inp is not None and lab is not None]
        for key in model_inputs.keys():
            model_inputs[key] = [model_inputs[key][i] for i in valid_indices]
    return model_inputs

In [3]:
#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
# !huggingface-cli login

In [4]:
#Device specific params
import os
os.environ['model_name_or_path'] = "models/marian/marian_output/base_model"

In [6]:
# # Get Marian tokenizer
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-af-fr", src_lang="af", tgt_lang="fr")
# tokenizer.save_pretrained("tokenizer_custom/marian")

In [5]:
import gc
from transformers import GenerationConfig, MarianConfig, AutoTokenizer
first = True
# first = False

# base_model_path = "models/marian/marian_output/base_model"
# custom_tokenizer="tokenizer_custom/nllb"

create_or_clean_directory(os.environ['model_name_or_path'])
if first:
    # Download the model
    # Load & save model:
    model = AutoModelForSeq2SeqLM.from_pretrained(config.get('model_name_or_path'))
    model.save_pretrained(os.environ['model_name_or_path'])
    #Overwrite Generation Config
    delete_file_if_exists(os.environ['model_name_or_path']+'/generation_config.json')
    generation_config = GenerationConfig(
    bad_words_ids = [[31999]],
    bos_token_id = 1,
    decoder_start_token_id = 31999,
    eos_token_id= 2,
    forced_eos_token_id= 2,
    max_length= 48,
    num_beams= 1,
    pad_token_id= 31999,
    )
    generation_config.save_pretrained(os.environ['model_name_or_path'] )
    #Overwrite Model Config
    mconfig = MarianConfig.from_pretrained(os.environ['model_name_or_path'])
    mconfig.dropout=0.5
    mconfig.max_length=48
    mconfig.num_beams=1
    mconfig.vocab_size = 32000
    mconfig.bad_words_ids = [[31999]]
    mconfig.decoder_start_token_id = 1
    mconfig.forced_eos_token_id=2
    mconfig.decoder_vocab_size = 32000
    mconfig.pad_token_id=31999
    mconfig.save_pretrained(os.environ['model_name_or_path'] )
    #Overwrite Toeknizer
    tokenizer = AutoTokenizer.from_pretrained("tokenizer_custom/combined_pad_V2", src_lang="dyu", tgt_lang="fr")
    tokenizer.save_pretrained(os.environ['model_name_or_path'])  
    # Reload model
    model = AutoModelForSeq2SeqLM.from_pretrained(os.environ['model_name_or_path'],ignore_mismatched_sizes=True)
else:
    # Copy checkpoint to base model path
    copy_file_or_directory(config.get('model_name_or_path'), os.environ['model_name_or_path'])

Cleaned existing directory: models/marian/marian_output/base_model


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59421]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 48, 'bad_words_ids': [[31999]], 'forced_eos_token_id': 0}


File models/marian/marian_output/base_model/generation_config.json has been deleted.


Some weights of MarianMTModel were not initialized from the model checkpoint at models/marian/marian_output/base_model and are newly initialized because the shapes did not match:
- final_logits_bias: found shape torch.Size([1, 59422]) in the checkpoint and torch.Size([1, 32000]) in the model instantiated
- model.shared.weight: found shape torch.Size([59422, 512]) in the checkpoint and torch.Size([32000, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=os.environ['model_name_or_path'])

In [7]:
from datasets import load_dataset,concatenate_datasets
zindi_ds = load_dataset("uvci/Koumankan_mt_dyu_fr")
tokenized_zds = zindi_ds.map(
    preprocess_function,
    batched=True,
    remove_columns=zindi_ds["train"].column_names  # Remove original columns
)
concat_ds = concatenate_datasets([tokenized_zds['train'], tokenized_zds['test']])

Map: 100%|██████████| 8065/8065 [00:00<00:00, 10764.53 examples/s]
Map: 100%|██████████| 1471/1471 [00:00<00:00, 10273.36 examples/s]
Map: 100%|██████████| 1393/1393 [00:00<00:00, 14206.43 examples/s]


In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="models/marian/marian_output",
    eval_strategy="epoch",
    eval_steps = 1,
    learning_rate=1e-4,
    per_device_train_batch_size=100,
    per_device_eval_batch_size=100,
    # weight_decay=0.01,
    num_train_epochs=200000,
    predict_with_generate=True,
    # fp16=True,
    # push_to_hub=False,
    do_train=True,
    do_eval=True,
    gradient_accumulation_steps=20,
    logging_dir= "models/marian/marian_output/logs",
    logging_steps = 1,
    save_strategy = 'epoch',
    save_steps = 2,
    save_total_limit = 3,
    seed = 42,
    dataloader_drop_last = False,
    # label_smoothing_factor: float = 0.0,
    optim = 'adafactor',
    # resume_from_checkpoint: Optional[str] = None,
    # fp16_backend: str = 'auto',
    # batch_eval_metrics: bool = False,
    # eval_on_start=True,
    # generation_max_length= 50,
    # generation_num_beams=1,
    generation_config = "models/marian/marian_output/base_model/generation_config.json",
    run_name="MTest1"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=concat_ds,
    eval_dataset=tokenized_zds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,3.5568,3.015909,0.0,1.1829
1,2.0694,5.800525,0.0,47.0
2,1.8884,1.88094,0.0,18.2617


Non-default generation parameters: {'max_length': 48, 'bad_words_ids': [[31999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 48, 'bad_words_ids': [[31999]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 48, 'bad_words_ids': [[31999]], 'forced_eos_token_id': 0}


In [None]:
# os.environ['dataset_name'] = "uvci/Koumankan_mt_dyu_fr"
# os.environ['generation_config'] = os.environ['model_name_or_path']+'/generation_config.json'
# os.environ['source_lang']="dyu_Latn"
# os.environ['target_lang']="fra_Latn"
# os.environ['dataset_config_name']= "default"
# os.environ['per_device_eval_batch_size']="1024"
# os.environ['per_device_train_batch_size']="1024"
# # use_cpu=False
# os.environ['save_steps']="100"
# os.environ['eval_steps']="100"
# os.environ['num_train_epochs']="10000"
# os.environ['logging_steps']="10"
# os.environ['save_total_limit']="10"
# os.environ['overwrite_output_dir']="True"
# os.environ['run_name']="marian-1"
# os.environ['output_dir']="models/marian/marian_output"
# os.environ['logging_dir']="models/marian/logs"
# os.environ['predict_with_generate']="True"
# os.environ['dataloader_drop_last']="True"
# os.environ['jit_mode_eval']="False"
# # os.environ['do_eval']="True"
# os.environ['do_predict']="False"
# os.environ['do_train']="True"

# ### Config
# os.environ['label_smoothing_factor']="0.00001"
# os.environ['learning_rate']="5e-04"
# os.environ['gradient_accumulation_steps']="4"
# os.environ['generation_max_length']="128"
# os.environ['generation_num_beams']="2"
# os.environ['max_source_length']= "128"
# os.environ['warmup_steps']="10"
# os.environ['weight_decay']="0.00001"
# os.environ['seed']="42"
# os.environ['fp16']="False"
# os.environ['fp16_backend']="auto"
# os.environ['fp16_full_eval']="False"
# os.environ['full_determinism']="True"


In [None]:
# !python transformers/examples/pytorch/translation/run_translation.py \
# --per_device_train_batch_size $$per_device_train_batch_size \
# --per_device_eval_batch_size $$per_device_eval_batch_size \
# --save_steps $$save_steps \
# --num_train_epochs $$num_train_epochs \
# --logging_steps $$logging_steps \
# --label_smoothing_factor $$label_smoothing_factor \
# --learning_rate $$learning_rate \
# --run_name $$run_name \
# --output_dir $$output_dir \
# --logging_dir $$logging_dir \
# --eval_steps $$eval_steps \
# --gradient_accumulation_steps $$gradient_accumulation_steps \
# --model_name_or_path  $$model_name_or_path  \
# --dataset_name  $$dataset_name  \
# --generation_max_length $$generation_max_length \
# --generation_num_beams $$generation_num_beams \
# --source_lang $$source_lang \
# --target_lang $$target_lang \
# --dataset_config_name $$dataset_config_name \
# --predict_with_generate $$predict_with_generate \
# --max_source_length $$max_source_length \
# --dataloader_drop_last $$dataloader_drop_last \
# --warmup_steps $$warmup_steps \
# --weight_decay $$weight_decay \
# --save_total_limit $$save_total_limit \
# --seed $$seed \
# --overwrite_output_dir $$overwrite_output_dir \
# --jit_mode_eval $$jit_mode_eval \
# --do_train $$do_train \
# --fp16 $$fp16 \
# --fp16_backend $$fp16_backend \
# --fp16_full_eval $$fp16_full_eval \
# --full_determinism $$full_determinism \
# --predict_with_generate true \
# --do_eval $$do_eval \
# --do_predict $$do_predict \
# --eval_strategy steps \
# --generation_config $$generation_config 
# # --resume_from_checkpoint {resume_from_checkpoint} 
# # --use_cpu {use_cpu} 

08/17/2024 16:11:17 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=True,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=100,
eval_