In [43]:
# %cd /home/rana/Projects/zindi
%cd /root/zindi/
import yaml
import json
import os
import shutil

from transformers import AutoModelForSeq2SeqLM
# Load the shared project settings; the file is parsed with PyYAML's safe loader.
with open('common/config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

def delete_file_if_exists(file_path):
    """Remove ``file_path`` if it is present and report the outcome on stdout.

    Args:
        file_path: Path of the file to delete.

    Returns:
        None. A message is printed in both the "deleted" and the
        "does not exist" case.
    """
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.")
        return

    os.remove(file_path)
    print(f"File {file_path} has been deleted.")


def copy_file_or_directory(source, destination):
    """Copy one file, or the regular files directly inside a directory.

    When ``source`` is a file it is copied to ``destination`` with
    ``shutil.copy2``. When ``source`` is a directory, ``destination`` is
    created if it does not exist and every regular file at the top level of
    ``source`` is copied into it; sub-directories are skipped, so the copy is
    not recursive.

    Args:
        source: Path of the file or directory to copy from.
        destination: Target path (file copy) or target directory
            (directory copy).

    Returns:
        None. Progress and problems are reported with ``print``; exceptions
        raised while copying are caught here and not propagated.
    """
    try:
        if os.path.isfile(source):
            # Single file: hand it straight to copy2.
            shutil.copy2(source, destination)
            print(f"File copied successfully from {source} to {destination}")
            return

        if not os.path.isdir(source):
            print(f"Source {source} is neither a file nor a directory.")
            return

        # Directory: make sure the target exists, then copy top-level files only.
        if not os.path.exists(destination):
            os.makedirs(destination)
        for entry_name in os.listdir(source):
            src_item = os.path.join(source, entry_name)
            if not os.path.isfile(src_item):
                continue
            dst_item = os.path.join(destination, entry_name)
            shutil.copy2(src_item, dst_item)
            print(f"File copied successfully from {src_item} to {dst_item}")
    except FileNotFoundError:
        print(f"Source not found: {source}")
    except PermissionError:
        print("Permission denied. Check file permissions.")
    except shutil.SameFileError:
        print("Source and destination are the same file.")
    except Exception as err:
        print(f"An error occurred: {err!s}")

def create_or_clean_directory(path):
    """Ensure ``path`` is an existing, empty directory.

    If ``path`` does not exist it is created (including parents). If it
    exists, every entry inside it is removed: real sub-directories are
    deleted recursively, everything else (regular files, symlinks of any
    kind, including broken ones) is unlinked. Symlinks are removed as links;
    their targets are never touched.

    Args:
        path: Directory to create or empty.

    Returns:
        None. A one-line status message is printed.

    Raises:
        NotADirectoryError: If ``path`` exists but is not a directory
            (raised by ``os.listdir``).
        OSError: If an entry cannot be removed.
    """
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Created new directory: {path}")
        return

    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        # os.path.isdir follows symlinks, and shutil.rmtree refuses to operate
        # on a symlink, so a link to a directory must be unlinked instead.
        if os.path.isdir(item_path) and not os.path.islink(item_path):
            shutil.rmtree(item_path)
        else:
            os.unlink(item_path)
    print(f"Cleaned existing directory: {path}")

/root/zindi


In [44]:
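# [REDACTED: a Hugging Face access token was pasted here. Revoke it, and supply credentials via `huggingface-cli login` or the HF_TOKEN environment variable instead of committing them.]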
# !huggingface-cli login

In [45]:
# Device-specific parameters.
# Exported as an environment variable so both later cells and the shell
# command that launches training can read the base-model directory.
os.environ.update(model_name_or_path="models/marian/marian_output/base_model")

In [46]:
# Mode switch for preparing the base-model directory:
#   True  -> load the model named in the config via from_pretrained and add the custom tokenizer
#   False -> copy the files of the path named in the config into the base-model directory
first = False

custom_tokenizer = "tokenizer_custom/combined_pad_V1"

# Both modes start from an empty (or freshly created) base-model directory.
create_or_clean_directory(os.environ['model_name_or_path'])

if not first:
    # Continue from an existing checkpoint: copy its top-level files across.
    copy_file_or_directory(config.get('model_name_or_path'), os.environ['model_name_or_path'])
else:
    # Fresh start: load the configured model and store it locally.
    model = AutoModelForSeq2SeqLM.from_pretrained(config.get('model_name_or_path'))
    model.save_pretrained(os.environ['model_name_or_path'])
    # Drop the saved generation_config.json, then place the custom tokenizer
    # files next to the model weights.
    delete_file_if_exists(f"{os.environ['model_name_or_path']}/generation_config.json")
    copy_file_or_directory(custom_tokenizer, os.environ['model_name_or_path'])

Cleaned existing directory: models/marian/marian_output/base_model
File copied successfully from /root/zindi/models/marian/marian_output/checkpoint-400_copy/config.json to models/marian/marian_output/base_model/config.json
File copied successfully from /root/zindi/models/marian/marian_output/checkpoint-400_copy/generation_config.json to models/marian/marian_output/base_model/generation_config.json


File copied successfully from /root/zindi/models/marian/marian_output/checkpoint-400_copy/model.safetensors to models/marian/marian_output/base_model/model.safetensors
File copied successfully from /root/zindi/models/marian/marian_output/checkpoint-400_copy/optimizer.pt to models/marian/marian_output/base_model/optimizer.pt
File copied successfully from /root/zindi/models/marian/marian_output/checkpoint-400_copy/rng_state.pth to models/marian/marian_output/base_model/rng_state.pth
File copied successfully from /root/zindi/models/marian/marian_output/checkpoint-400_copy/scheduler.pt to models/marian/marian_output/base_model/scheduler.pt
File copied successfully from /root/zindi/models/marian/marian_output/checkpoint-400_copy/source.spm to models/marian/marian_output/base_model/source.spm
File copied successfully from /root/zindi/models/marian/marian_output/checkpoint-400_copy/special_tokens_map.json to models/marian/marian_output/base_model/special_tokens_map.json
File copied successful

In [47]:
import gc

# Rewrite config.json in the base-model directory, then reload and re-save
# the model so the stored artefacts reflect the patched configuration.
config_path = f"{os.environ['model_name_or_path']}/config.json"

with open(config_path, 'r') as f:
    data = json.load(f)

# Architecture-level keys that were tried and are currently left at the
# values already in config.json: decoder_attention_heads, decoder_ffn_dim,
# decoder_layers, encoder_attention_heads, encoder_ffn_dim, encoder_layers,
# max_position_embeddings, num_hidden_layers, torch_dtype.
data.update({
    'dropout': 0.75,
    # Generation defaults and special-token ids.
    'max_length': 128,
    'num_beams': 2,
    'bos_token_id': 1,
    'eos_token_id': 2,
    'forced_eos_token_id': 2,
    'bad_words_ids': [[31999]],
    'decoder_start_token_id': 31999,
    'decoder_vocab_size': 32000,
    'pad_token_id': 31999,
    'vocab_size': 32000,
})

with open(config_path, 'w') as f:
    json.dump(data, f, indent=2)

# Reload with the patched config; ignore_mismatched_sizes=True is passed so
# that size differences between stored weights and the new config do not
# abort loading. Then write the model back to the same directory.
model = AutoModelForSeq2SeqLM.from_pretrained(
    os.environ['model_name_or_path'],
    ignore_mismatched_sizes=True,
)
model.save_pretrained(os.environ['model_name_or_path'])

# Release the model before training is launched from the shell.
del model
gc.collect()

Non-default generation parameters: {'max_length': 128, 'num_beams': 2, 'bad_words_ids': [[31999]], 'forced_eos_token_id': 2}


527

In [51]:
# Dataset and generation-config location (the latter lives in the base-model directory).
os.environ['dataset_name'] = "uvci/Koumankan_mt_dyu_fr"
os.environ['generation_config'] = f"{os.environ['model_name_or_path']}/generation_config.json"

# Run options; every value is a string because it is exported to the environment.
os.environ.update({
    'source_lang': "dyu",
    'target_lang': "fr",
    'dataset_config_name': "default",
    'per_device_eval_batch_size': "480",
    'per_device_train_batch_size': "480",
    # use_cpu=False
    'save_steps': "100",
    'eval_steps': "100",
    'num_train_epochs': "10000",
    'logging_steps': "10",
    'save_total_limit': "10",
    'overwrite_output_dir': "True",
    'run_name': "marian-1",
    'output_dir': "models/marian/marian_output",
    'logging_dir': "models/marian/logs",
    'predict_with_generate': "True",
    'dataloader_drop_last': "True",
    'jit_mode_eval': "False",
    # 'do_eval' is intentionally not exported here.
    'do_predict': "False",
    'do_train': "True",
})

### Config
# Optimisation, generation and precision hyper-parameters.
os.environ.update({
    'label_smoothing_factor': "0.00001",
    'learning_rate': "1e-04",
    'gradient_accumulation_steps': "4",
    'generation_max_length': "128",
    'generation_num_beams': "2",
    'max_source_length': "128",
    'warmup_steps': "10",
    'weight_decay': "0.00001",
    'seed': "42",
    'fp16': "False",
    'fp16_backend': "auto",
    'fp16_full_eval': "False",
    'full_determinism': "True",
})


In [52]:
# from transformers import GenerationConfig

# # Create a custom generation config
# custom_gen_config = GenerationConfig(
#     bad_words_ids=[[31999]],
#     bos_token_id=1,
#     decoder_start_token_id=31999,
#     eos_token_id=2,
#     forced_eos_token_id=2,
#     pad_token_id=31999,
#     num_beams=1,
#     max_length=128
#     # Add any other parameters you want to override
# )

# custom_gen_config.save_pretrained(base_model_path, "generation_config.json")
# os.environ['generation_config']=base_model_path+"/generation_config.json"

In [53]:
# Launch the Hugging Face example translation script as a shell command.
# `$$name` is IPython's escape for a literal `$name`, so the shell (not
# IPython) substitutes the environment variables exported through os.environ
# in the earlier cells.
# NOTE(review): --predict_with_generate is passed twice (once from the
# environment variable, once as the literal `true`). --do_eval and
# --do_predict are literals as well, so the do_predict="False" environment
# variable is not what reaches the script (the logged arguments show
# do_predict=True) -- confirm which value is intended.
!python transformers/examples/pytorch/translation/run_translation.py \
--per_device_train_batch_size $$per_device_train_batch_size \
--per_device_eval_batch_size $$per_device_eval_batch_size \
--save_steps $$save_steps \
--num_train_epochs $$num_train_epochs \
--logging_steps $$logging_steps \
--label_smoothing_factor $$label_smoothing_factor \
--learning_rate $$learning_rate \
--run_name $$run_name \
--output_dir $$output_dir \
--logging_dir $$logging_dir \
--eval_steps $$eval_steps \
--gradient_accumulation_steps $$gradient_accumulation_steps \
--model_name_or_path  $$model_name_or_path  \
--dataset_name  $$dataset_name  \
--generation_max_length $$generation_max_length \
--generation_num_beams $$generation_num_beams \
--source_lang $$source_lang \
--target_lang $$target_lang \
--dataset_config_name $$dataset_config_name \
--predict_with_generate $$predict_with_generate \
--max_source_length $$max_source_length \
--dataloader_drop_last $$dataloader_drop_last \
--warmup_steps $$warmup_steps \
--weight_decay $$weight_decay \
--save_total_limit $$save_total_limit \
--seed $$seed \
--overwrite_output_dir $$overwrite_output_dir \
--jit_mode_eval $$jit_mode_eval \
--do_train $$do_train \
--fp16 $$fp16 \
--fp16_backend $$fp16_backend \
--fp16_full_eval $$fp16_full_eval \
--full_determinism $$full_determinism \
--predict_with_generate true \
--do_eval true \
--do_predict true \
--eval_strategy steps \
--generation_config $$generation_config 
# --resume_from_checkpoint {resume_from_checkpoint} 
# --use_cpu {use_cpu} 

08/16/2024 07:05:06 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=True,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=100,
eval_s