<a href="https://colab.research.google.com/github/Saputoa21/Machine-Translation/blob/main/finetune_mBART_AdvancedMT2025W.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pinned dependencies. %pip (rather than !pip) installs into the environment
# of the running kernel.
#!pip install transformers[sentencepiece]==4.57.2   # 4.44.2
%pip install datasets==2.19.0
%pip install evaluate==0.4.3
#!pip install accelerate==0.34.2
%pip install sacrebleu==2.4.3



In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# transformers: newer version, has modeling_layers
%pip install -q "transformers[sentencepiece]>=4.47.0"
%pip install -q "datasets==2.19.0"
%pip install -q "evaluate==0.4.3"
%pip install -q "accelerate==0.34.2"
%pip install -q "sacrebleu==2.4.3"

In [None]:
#txt to json
import sys
import json
import re
import codecs

# Target format: one JSON object per line, as in Hugging Face translation datasets, e.g.:
#{ "translation": { "en": "Others have dismissed him as a joke.", "ro": "Alții l-au numit o glumă." } }
#{ "translation": { "en": "And some are holding out for an implosion.", "ro": "Iar alții așteaptă implozia." } }

def txt2json(src_id, trg_id, src_file, trg_file, out_file):
  """Convert two line-aligned text files into one JSON-lines translation file.

  Line i of ``src_file`` is paired with line i of ``trg_file``; each pair is
  written as a single output line of the form::

      { "translation": { "<src_id>": "...", "<trg_id>": "..." } }

  Args:
    src_id: key under which the source sentence is stored (e.g. 'en_XX').
    trg_id: key under which the target sentence is stored (e.g. 'de_DE').
    src_file: path of the UTF-8 source text file, one sentence per line.
    trg_file: path of the UTF-8 target text file, one sentence per line.
    out_file: path of the JSON-lines file to write (overwritten if present).

  Returns:
    None.

  Note:
    Pairing stops at the end of the shorter input, so surplus lines in the
    longer file are ignored.
  """
  # Built-in open() only breaks lines at \n, \r and \r\n. Iterating a
  # codecs.open() stream also breaks at Unicode separators such as U+2028 or
  # U+0085, which can shift one side of a parallel corpus against the other.
  # The with-statement closes all three files, including on error.
  with open(src_file, 'r', encoding="utf-8") as src, \
       open(trg_file, 'r', encoding="utf-8") as trg, \
       open(out_file, 'w', encoding="utf-8", newline="\n") as out_json:
    for line_s, line_t in zip(src, trg):
      out = {"translation": {src_id: line_s.strip(), trg_id: line_t.strip()}}
      # indent=0 puts every item on its own line; joining those lines with
      # spaces gives the one-record-per-line layout shown above.
      # ensure_ascii=False keeps non-ASCII text as readable UTF-8.
      x = json.dumps(out, indent=0, ensure_ascii=False)
      out_json.write(x.replace("\n", " ") + "\n")
  return

In [None]:
# Language pair: en-de.
# Paths of the two line-aligned text files and of the JSON-lines file to create.
train_src = "train.en-de.de-filtered.de.semantic.en"
train_trg = "train.en-de.en-filtered.en.semantic.de"
train_json = "train.en-de.json"

# The record keys are the mBART language codes of the two sides.
txt2json(
    src_id='en_XX',
    trg_id='de_DE',
    src_file=train_src,
    trg_file=train_trg,
    out_file=train_json,
)

In [None]:
import torch
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    MBartTokenizer,
    MBart50Tokenizer,
    MBartTokenizerFast,
    MBart50TokenizerFast,
    SchedulerType,
    default_data_collator,
    get_scheduler,
    set_seed,
)
from datasets import load_dataset, load_metric, DatasetDict
import transformers
import os
import numpy as np

# Config classes registered in MODEL_MAPPING and their model-type strings.
MODEL_CONFIG_CLASSES = [*MODEL_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)
# Tokenizer classes for which main() sets src_lang/tgt_lang and a forced BOS token.
MULTILINGUAL_TOKENIZERS = [
    MBartTokenizer,
    MBartTokenizerFast,
    MBart50Tokenizer,
    MBart50TokenizerFast,
]


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Every tensor yielded by ``model.named_parameters()`` is counted via
    ``numel()``; tensors with ``requires_grad`` set count as trainable.
    Prints one summary line and returns ``None``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        n_elements = param.numel()
        all_param += n_elements
        if param.requires_grad:
            trainable_params += n_elements
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

def main():
    """Fine-tune mBART-50 (one-to-many) for English -> German translation.

    Loads the pretrained checkpoint and its tokenizer, configures the
    source/target language codes, builds train/valid/test subsets from
    ``train.en-de.json`` (saved to ``data_save_test``), tokenizes them, trains
    for one epoch with ``Seq2SeqTrainer`` while scoring sacreBLEU on the
    validation subset, and finally writes the model and tokenizer to
    ``output_dir``.

    All settings are local constants at the top of the body. Takes no
    arguments and returns ``None``.
    """
    model_id = "facebook/mbart-large-50-one-to-many-mmt"
    max_length = 100  # token limit per sentence; longer sequences are truncated

    source_code = 'en_XX'
    target_code = 'de_DE'
    # Language token forced as the first generated token; None disables forcing.
    forced_bos_token = 'de_DE'
    data_files = "train.en-de.json"
    # Training size of 20k: https://arxiv.org/pdf/2312.12740.pdf
    output_dir = 'models/mbart50-full-finetune-1epoch-1e4'
    train_bs = 1
    grad_acc = 4
    lr = 1e-4
    w_steps = 0.03  # passed to the trainer as warmup_ratio
    n_epoch = 1
    lr_scheduler_type = "linear"
    # Subset sizes; keep them small to limit compute cost.
    n_train, n_valid, n_test = 5000, 1000, 1000

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id,
                                                  device_map="auto")

    model.resize_token_embeddings(len(tokenizer))

    # Set decoder_start_token_id for (original) mBART tokenizers when the
    # checkpoint does not define one.
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        # isinstance() accepts exactly two arguments; passing the two classes
        # as separate arguments raises TypeError.
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_code]
        else:
            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_code)

    if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
        tokenizer.src_lang = source_code
        tokenizer.tgt_lang = target_code

        # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
        # as the first generated token.
        forced_bos_token_id = (
            tokenizer.convert_tokens_to_ids(forced_bos_token) if forced_bos_token is not None else None
        )
        model.config.forced_bos_token_id = forced_bos_token_id
        # generate() takes its defaults from generation_config, so mirror the
        # setting there as well.
        model.generation_config.forced_bos_token_id = forced_bos_token_id

    print_trainable_parameters(model)
    print(model)

    # NOTE(review): load_metric is deprecated in `datasets` (it emits a
    # FutureWarning); evaluate.load('sacrebleu') is the suggested replacement.
    metric = load_metric('sacrebleu', trust_remote_code=True)

    def preprocess_parallel_function(examples):
        """Tokenize a batch of {"translation": {...}} records into inputs and labels."""
        inputs = [ex[source_code] for ex in examples["translation"]]
        targets = [ex[target_code] for ex in examples["translation"]]
        model_inputs = tokenizer(inputs, max_length=max_length, padding=False, truncation=True)
        # text_target= encodes in target mode, so the labels carry the
        # tgt_lang code instead of the src_lang code.
        labels = tokenizer(text_target=targets, max_length=max_length, padding=False, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def postprocess_text(preds, labels):
        """Strip whitespace; wrap each reference in a list as sacreBLEU expects."""
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]
        return preds, labels

    def compute_metrics(eval_preds):
        """Decode predictions and references and return BLEU and mean generated length."""
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        # Replace -100 (padding used by the trainer) as we can't decode it.
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result = {'bleu': result['score']}
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

    def first_n(dataset, n):
        """Return the first ``n`` rows of ``dataset`` (all rows if it has fewer)."""
        return dataset.select(range(min(n, len(dataset))))

    # 80/10/10 split of the corpus, then a fixed-size head of each part.
    data = load_dataset("json", data_files=data_files)
    data_split_tmp = data['train'].train_test_split(test_size=0.2, seed=42)
    data_split_valid_test = data_split_tmp['test'].train_test_split(test_size=0.5, seed=42)
    data = DatasetDict({'train': first_n(data_split_tmp['train'], n_train),
                        'valid': first_n(data_split_valid_test['train'], n_valid),
                        'test':  first_n(data_split_valid_test['test'], n_test),})
    # Saved before tokenization; reload later with load_from_disk(), not load_dataset().
    data.save_to_disk("data_save_test")
    print(data)
    column_names = data["train"].column_names
    print(column_names)
    print(data.column_names)
    print(data.num_rows)
    data = data.map(preprocess_parallel_function,
                    batched=True)
    label_pad_token_id = -100  # value the collator uses to pad label sequences

    trainer = transformers.Seq2SeqTrainer(
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["valid"],
        args=transformers.Seq2SeqTrainingArguments(
            report_to='none',  # turn off wandb
            per_device_train_batch_size=train_bs,
            gradient_accumulation_steps=grad_acc,
            do_train=True,
            do_eval=True,
            per_device_eval_batch_size=4,
            eval_accumulation_steps=4,
            warmup_ratio=w_steps,
            lr_scheduler_type=lr_scheduler_type,
            num_train_epochs=n_epoch,
            predict_with_generate=True,
            metric_for_best_model='bleu',
            load_best_model_at_end=True,
            learning_rate=lr,
            save_total_limit=1,
            generation_num_beams=4,
            save_strategy="epoch",
            eval_strategy="epoch",
            output_dir=output_dir,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(tokenizer,
                                                          label_pad_token_id=label_pad_token_id,
                                                          model=model),
        compute_metrics=compute_metrics,
        # `tokenizer=` is deprecated on the Trainer; `processing_class` is its replacement.
        processing_class=tokenizer,
    )
    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)






KeyboardInterrupt: 

In [None]:
# Fine-tune: run the English-German training defined in main().
main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


trainable params: 610879488 || all params: 610879488 || trainable%: 100.0
MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      

  metric = load_metric('sacrebleu', trust_remote_code=True)


Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 5000
    })
    valid: Dataset({
        features: ['translation'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1000
    })
})
['translation']
{'train': ['translation'], 'valid': ['translation'], 'test': ['translation']}
{'train': 5000, 'valid': 1000, 'test': 1000}


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.4298,1.182547,38.7965,30.576


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_bos_token_id': 250003, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_bos_token_id': 250003, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_bos_token_id': 250003, 'forced_eos_token_id': 2}


In [None]:
# Use the fine-tuned model: load it together with its tokenizer and translate
# the saved test split, writing one translation per line to 'mtouput_fullFT'.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_from_disk
import codecs
import torch

finetuned_dir = "models/mbart50-full-finetune-1epoch-1e4"  # path of your directory with the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)
tokenizer.src_lang = "en_XX"  # mBART source language

model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_dir)
# Run on the GPU when one is available; eval() switches off dropout.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

mt_output = []  # collected translations, in test-set order
data = load_from_disk("data_save_test")
print(data)
# The with-statement guarantees the output file is flushed and closed.
with open('mtouput_fullFT', 'w', encoding='utf-8') as output:
    for line in data['test']:
        src = line['translation']['en_XX']
        encoded = tokenizer(src, return_tensors="pt").to(device)  # tokenize
        with torch.no_grad():  # inference only, no gradients needed
            generated_tokens = model.generate(
                **encoded,
                forced_bos_token_id=tokenizer.convert_tokens_to_ids('de_DE'),
            )
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)  # detokenize
        mt_output.append(translation[0])
        print(translation[0], file=output)
        print(translation[0])


In [None]:
import gc
#del(model)
#gc.collect()
#torch.cuda.empty_cache()

# **Fine-tuning with my corpus**

In [None]:
#txt to json
import sys
import json
import re
import codecs

def txt2json(src_id, trg_id, src_file, trg_file, out_file):
  """Convert two line-aligned text files into one JSON-lines translation file.

  Line i of ``src_file`` is paired with line i of ``trg_file``; each pair is
  written as a single output line of the form::

      { "translation": { "<src_id>": "...", "<trg_id>": "..." } }

  Args:
    src_id: key under which the source sentence is stored (e.g. 'en_XX').
    trg_id: key under which the target sentence is stored (e.g. 'ru_RU').
    src_file: path of the UTF-8 source text file, one sentence per line.
    trg_file: path of the UTF-8 target text file, one sentence per line.
    out_file: path of the JSON-lines file to write (overwritten if present).

  Returns:
    None.

  Note:
    Pairing stops at the end of the shorter input, so surplus lines in the
    longer file are ignored.
  """
  # Built-in open() only breaks lines at \n, \r and \r\n. Iterating a
  # codecs.open() stream also breaks at Unicode separators such as U+2028 or
  # U+0085, which can shift one side of a parallel corpus against the other.
  # The with-statement closes all three files, including on error.
  with open(src_file, 'r', encoding="utf-8") as src, \
       open(trg_file, 'r', encoding="utf-8") as trg, \
       open(out_file, 'w', encoding="utf-8", newline="\n") as out_json:
    for line_s, line_t in zip(src, trg):
      out = {"translation": {src_id: line_s.strip(), trg_id: line_t.strip()}}
      # indent=0 puts every item on its own line; joining those lines with
      # spaces gives the one-record-per-line layout shown above.
      # ensure_ascii=False keeps non-ASCII text as readable UTF-8.
      x = json.dumps(out, indent=0, ensure_ascii=False)
      out_json.write(x.replace("\n", " ") + "\n")
  return

In [None]:
# Language pair: en-ru.
# Paths of the two line-aligned text files and of the JSON-lines file to create.
train_src = "MultiUN.en-ru.en-filtered.en.semantic.en"
train_trg = "MultiUN.en-ru.ru-filtered.ru.semantic.ru"
train_json = "train.en-ru.json"

# The record keys are the mBART language codes of the two sides.
txt2json(
    src_id='en_XX',
    trg_id='ru_RU',
    src_file=train_src,
    trg_file=train_trg,
    out_file=train_json,
)

In [None]:
# Preview the first 10 source sentences; the with-statement closes the file.
with codecs.open(train_src, 'r', encoding="utf-8") as src:
    for i, line_s in enumerate(src, start=1):
        print(line_s.strip())
        if i == 10:
            break

Status of the Supplementary Fund as at 30 June 2005
where R was surface recession at open exposure measured in µm.
During the beating an officer, whom the complainant knew by name, also entered the room and, while he did not take part in the abuse, he did not stop it.
It specified that the new text submitted (TRANS/WP.1/2004/8/Rev.2) took into account not only the earlier discussions WP.1 had conducted in this regard but also the many proposals for improvement transmitted by the Office of Legal Affairs in New York as well as the decision of the Economic Commission for Europe adopted at its sixtieth session (report E/ECE/1431, point 44 (d)) which appears below:
Calls on all parties to implement fully the Pretoria Agreement and reminds them that they have decided in the Pretoria Agreement to refer to the mediator, President Thabo Mbeki, any differences which may arise in the interpretation of any part of the agreement;
Annex 3 -Appendix 2
However, based on an open competition organized b

In [None]:
# Preview the first 10 target sentences; the with-statement closes the file.
with codecs.open(train_trg, 'r', encoding="utf-8") as trg:
    for i, line in enumerate(trg, start=1):
        print(line.strip())
        if i == 10:
            break

Состояние Дополнительного фонда на 30 июня 2005 года
где R представляло собой величину отступления поверхности в мкм при открытом воздействии.
Во время избиения еще один полицейский, имя которого заявителю известно, вошел в комнату, и, хотя сам он не принимал участия в избиении, он не остановил его.
Он уточнил, что в представленном новом тексте (TRANS/WP.1/2004/8/Rev.2) учтены не только итоги прежних дискуссий WР.1 по этому вопросу, но и многочисленные предложения об усовершенствовании текста, переданные Управлением по правовым вопросам в Нью-Йорке, а также следующее решение Европейской экономической комиссии, принятое на ее шестидесятой сессии (доклад Е/ЕСЕ/1431, пункт 44 d)):
призывает все стороны полностью осуществить Преторийское соглашение и напоминает им, что они договорились в Преторийском соглашении о том, что в случае возникновения различий в толковании любой части этого соглашения они проведут консультации с Посредником, функции которого выполняет президент Табо Мбеки;
Прилож

In [None]:
import pandas as pd
# Read the JSON-lines file (one record per line) and show the first rows.
# A bare last expression uses the notebook's rich table display.
df = pd.read_json('train.en-ru.json', lines=True)
df.head()

                                         translation
0  {'en_XX': 'Status of the Supplementary Fund as...
1  {'en_XX': 'where R was surface recession at op...
2  {'en_XX': 'During the beating an officer, whom...
3  {'en_XX': 'It specified that the new text subm...
4  {'en_XX': 'Calls on all parties to implement f...


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83994 entries, 0 to 83993
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   translation  83994 non-null  object
dtypes: object(1)
memory usage: 656.3+ KB
None


## First attempt with the provided settings

In [None]:
# Pin transformers to the version this notebook was run with (4.57.2) so a
# later release cannot silently change behaviour; %pip targets the kernel's
# environment.
%pip install -q --upgrade "transformers==4.57.2" "peft"



In [None]:
# Confirm which transformers version is active in the kernel.
import transformers
print(transformers.__version__)

4.57.2


In [None]:
from torch.optim import AdamW

In [None]:
from datasets import load_from_disk
import torch
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    #AdamW,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    MBartTokenizer,
    MBart50Tokenizer,
    MBartTokenizerFast,
    MBart50TokenizerFast,
    SchedulerType,
    default_data_collator,
    get_scheduler,
    set_seed,
)
from datasets import load_dataset, load_metric, DatasetDict
import transformers
import os
import numpy as np

# Config classes registered in MODEL_MAPPING and their model-type strings.
MODEL_CONFIG_CLASSES = [*MODEL_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)
# Tokenizer classes for which main() sets src_lang/tgt_lang and a forced BOS token.
MULTILINGUAL_TOKENIZERS = [
    MBartTokenizer,
    MBartTokenizerFast,
    MBart50Tokenizer,
    MBart50TokenizerFast,
]


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Every tensor yielded by ``model.named_parameters()`` is counted via
    ``numel()``; tensors with ``requires_grad`` set count as trainable.
    Prints one summary line and returns ``None``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        n_elements = param.numel()
        all_param += n_elements
        if param.requires_grad:
            trainable_params += n_elements
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

def main():
    """Fine-tune mBART-50 (one-to-many) for English -> Russian translation.

    Loads the pretrained checkpoint and its tokenizer, configures the
    source/target language codes, builds train/valid/test subsets from
    ``train.en-ru.json`` (saved to ``data_save_test``), tokenizes them, trains
    for one epoch with ``Seq2SeqTrainer`` while scoring sacreBLEU on the
    validation subset, and finally writes the model and tokenizer to
    ``output_dir``.

    All settings are local constants at the top of the body. Takes no
    arguments and returns ``None``.
    """
    model_id = "facebook/mbart-large-50-one-to-many-mmt"
    max_length = 100  # number of tokens per sentence; capped to reduce compute

    # The language codes below are the mBART annotations used as keys in the
    # JSON file, so no separate code-to-language mapping is needed.
    source_code = 'en_XX'
    target_code = 'ru_RU'
    # Language token forced as the beginning-of-sentence token; None disables forcing.
    forced_bos_token = 'ru_RU'
    data_files = "train.en-ru.json"
    # Training size of 20k: https://arxiv.org/pdf/2312.12740.pdf
    output_dir = 'models/mbart50-full-finetune-1epoch-1e4'
    train_bs = 1
    grad_acc = 4
    lr = 1e-4
    w_steps = 0.03  # passed to the trainer as warmup_ratio
    n_epoch = 1
    lr_scheduler_type = "linear"
    # Subset sizes; keep them small to limit compute cost.
    n_train, n_valid, n_test = 20000, 5000, 5000

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)  # use the GPU when available

    model.resize_token_embeddings(len(tokenizer))

    # Set decoder_start_token_id for (original) mBART tokenizers when the
    # checkpoint does not define one.
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        # isinstance() accepts exactly two arguments; passing the two classes
        # as separate arguments raises TypeError.
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_code]
        else:
            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_code)

    if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
        tokenizer.src_lang = source_code
        tokenizer.tgt_lang = target_code

        # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
        # as the first generated token (as in the lecture, with <2ja> for translation into Japanese).
        forced_bos_token_id = (
            tokenizer.convert_tokens_to_ids(forced_bos_token) if forced_bos_token is not None else None
        )
        model.config.forced_bos_token_id = forced_bos_token_id
        # generate() takes its defaults from generation_config, so mirror the
        # setting there as well.
        model.generation_config.forced_bos_token_id = forced_bos_token_id

    print_trainable_parameters(model)
    print(model)

    # NOTE(review): load_metric is deprecated in `datasets` (it emits a
    # FutureWarning); evaluate.load('sacrebleu') is the suggested replacement.
    metric = load_metric('sacrebleu', trust_remote_code=True)

    def preprocess_parallel_function(examples):
        """Tokenize a batch of {"translation": {...}} records into inputs and labels."""
        # Split each JSON record into its source and target sentence.
        inputs = [ex[source_code] for ex in examples["translation"]]
        targets = [ex[target_code] for ex in examples["translation"]]
        model_inputs = tokenizer(inputs, max_length=max_length, padding=False, truncation=True)
        # text_target= encodes in target mode, so the labels carry the
        # tgt_lang code instead of the src_lang code.
        labels = tokenizer(text_target=targets, max_length=max_length, padding=False, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def postprocess_text(preds, labels):
        """Strip whitespace; wrap each reference in a list as sacreBLEU expects."""
        preds = [pred.strip() for pred in preds]  # predictions of the model
        labels = [[label.strip()] for label in labels]  # labels = gold standard
        return preds, labels

    def compute_metrics(eval_preds):
        """Decode predictions and references and return BLEU and mean generated length."""
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        # Replace -100 (padding used by the trainer) as we can't decode it.
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        # skip_special_tokens drops EOS, BOS, language codes, etc.
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result = {'bleu': result['score']}
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

    def first_n(dataset, n):
        """Return the first ``n`` rows of ``dataset`` (all rows if it has fewer)."""
        return dataset.select(range(min(n, len(dataset))))

    # 80/10/10 split of the corpus, then a fixed-size head of each part.
    data = load_dataset("json", data_files=data_files)
    data_split_tmp = data['train'].train_test_split(test_size=0.2, seed=42)
    data_split_valid_test = data_split_tmp['test'].train_test_split(test_size=0.5, seed=42)
    data = DatasetDict({'train': first_n(data_split_tmp['train'], n_train),
                        'valid': first_n(data_split_valid_test['train'], n_valid),
                        'test':  first_n(data_split_valid_test['test'], n_test),})
    # Saved before tokenization; reload later with load_from_disk(), not load_dataset().
    data.save_to_disk("data_save_test")
    print(data)
    column_names = data["train"].column_names
    print(column_names)
    print(data.column_names)
    print(data.num_rows)
    data = data.map(preprocess_parallel_function,
                    batched=True)
    label_pad_token_id = -100  # value the collator uses to pad label sequences

    trainer = transformers.Seq2SeqTrainer(
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["valid"],
        args=transformers.Seq2SeqTrainingArguments(
            report_to='none',  # turn off wandb
            per_device_train_batch_size=train_bs,
            gradient_accumulation_steps=grad_acc,
            do_train=True,
            do_eval=True,
            per_device_eval_batch_size=4,
            eval_accumulation_steps=4,
            warmup_ratio=w_steps,
            lr_scheduler_type=lr_scheduler_type,
            num_train_epochs=n_epoch,
            predict_with_generate=True,
            metric_for_best_model='bleu',
            load_best_model_at_end=True,
            learning_rate=lr,
            save_total_limit=1,
            generation_num_beams=4,
            save_strategy="epoch",
            eval_strategy="epoch",
            output_dir=output_dir,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(tokenizer,
                                                          label_pad_token_id=label_pad_token_id,
                                                          model=model),
        compute_metrics=compute_metrics,
        # `tokenizer=` is deprecated on the Trainer; `processing_class` is its replacement.
        processing_class=tokenizer,
    )
    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

In [None]:
# Pin accelerate to the version this notebook was run with (1.12.0); %pip
# targets the kernel's environment.
%pip install -q -U "accelerate==1.12.0"


Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.12.0-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
Successfully installed accelerate-1.12.0


In [None]:
# Confirm which accelerate version is active in the kernel.
import accelerate
print(accelerate.__version__)

1.12.0


In [None]:
# Select a GPU runtime before running this cell: main() moves the model to
# CUDA only when it is available and otherwise trains on the CPU.
main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


trainable params: 610879488 || all params: 610879488 || trainable%: 100.0
MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      

  metric = load_metric('sacrebleu', trust_remote_code=True)


Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 20000
    })
    valid: Dataset({
        features: ['translation'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 5000
    })
})
['translation']
{'train': ['translation'], 'valid': ['translation'], 'test': ['translation']}
{'train': 20000, 'valid': 5000, 'test': 5000}


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = transformers.Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.9438,0.937728,37.7376,41.0778


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the saved dataset splits to Google Drive.
!cp -r /content/data_save_test /content/drive/MyDrive/

In [None]:
# List the files of the copied test split on Drive.
!ls /content/drive/MyDrive/data_save_test/test

cache-2680b12dbb474d3e.arrow  data-00000-of-00001.arrow  state.json
cache-96d644eb4b5a15f7.arrow  dataset_info.json


In [None]:
# Create the destination folder first: if it does not exist, cp copies the
# model directory *as* "models" instead of placing it inside "models".
!mkdir -p /content/drive/MyDrive/models
!cp -r /content/models/mbart50-full-finetune-1epoch-1e4 /content/drive/MyDrive/models

In [None]:
# List the files of the copied model directory on Drive.
!ls /content/drive/MyDrive/models/mbart50-full-finetune-1epoch-1e4

checkpoint-5000		model.safetensors	 tokenizer_config.json
config.json		sentencepiece.bpe.model  tokenizer.json
generation_config.json	special_tokens_map.json  training_args.bin


In [None]:
from datasets import load_from_disk

# Load the dataset splits back from the copy stored on Google Drive
data = load_from_disk('/content/drive/MyDrive/data_save_test')

In [None]:
print(data['train'])  # Shows the features and row count of the 'train' split

Dataset({
    features: ['translation'],
    num_rows: 20000
})


In [None]:
# Directory on Google Drive that holds the fine-tuned model and tokenizer.
model_dir = "/content/drive/MyDrive/models/mbart50-full-finetune-1epoch-1e4"

In [None]:
# List the files in the model directory.
import os
print(os.listdir(model_dir))

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# mBART source language
tokenizer.src_lang = "en_XX"
