### Fine-tuning a model for English-to-Spanish translation using a TED Talk subtitles dataset

In [1]:
# Log in to the Hugging Face Hub (renders an interactive login widget in the notebook)
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Model, Dataset & Metric loading

In [2]:
# Hub checkpoint used as the starting point for fine-tuning (loads as a MarianMTModel, see the config output below)
model_checkpoint='Helsinki-NLP/opus-mt-en-mul'

In [3]:
from datasets import load_from_disk

In [4]:
# Load the dataset stored in the local "ted_trans_dataset" directory
# (a DatasetDict with train/validation/test splits, as shown in the next cell's output)
data = load_from_disk("ted_trans_dataset")

In [5]:
# Display the splits, their columns and row counts
data

DatasetDict({
    train: Dataset({
        features: ['Original_Sentence', 'Translate_SP', '__index_level_0__'],
        num_rows: 2454
    })
    validation: Dataset({
        features: ['Original_Sentence', 'Translate_SP', '__index_level_0__'],
        num_rows: 307
    })
    test: Dataset({
        features: ['Original_Sentence', 'Translate_SP', '__index_level_0__'],
        num_rows: 307
    })
})

In [6]:
# Load the ROUGE metric
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of
# `evaluate.load("rouge")`. `evaluate` is imported here but not used in the visible cells.
# Confirm which API the installed versions support before switching: the two return
# differently shaped results, and `compute_metrics` below reads `.mid.fmeasure`.
from datasets import load_metric
import evaluate

rouge_score = load_metric("rouge")

Tokenizer and Preprocessing

In [7]:
import transformers
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint being fine-tuned
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
# Truncation limits (in tokens) passed as `max_length` for source sentences and target translations
# NOTE(review): 1024 may exceed the model's maximum position embeddings (Marian checkpoints
# commonly use 512) — TODO confirm against `config.max_position_embeddings`.
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: batch with an "Original_Sentence" column (source text) and a
            "Translate_SP" column (target text).

    Returns:
        The tokenizer encoding of the source sentences with an added "labels"
        entry holding the token ids of the target sentences.
    """
    source_texts = list(examples["Original_Sentence"])
    encoded = tokenizer(source_texts, truncation=True, max_length=max_input_length)

    # Targets are encoded while the tokenizer is switched to target mode
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(
            examples["Translate_SP"], truncation=True, max_length=max_target_length
        )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [9]:
# Apply the preprocessing to the dataset in batches; the original text columns are kept
# (the trainer log below reports them as ignored)
tokenized_datasets = data.map(preprocess_function, batched=True)

Loading cached processed dataset at ted_trans_dataset/train\cache-5d82fcb5ee8de5f6.arrow
Loading cached processed dataset at ted_trans_dataset/validation\cache-774e890d4668dad1.arrow
Loading cached processed dataset at ted_trans_dataset/test\cache-20596ff67f2124b7.arrow


In [10]:
# PT
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model for the chosen checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [11]:
from transformers import AutoConfig
# Load the checkpoint's configuration on its own so it can be inspected in the next cell
config = AutoConfig.from_pretrained(model_checkpoint)

In [12]:
# Display the model configuration
config

MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-mul",
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      64109
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 64109,
  "decoder_vocab_size": 64110,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "extra_pos_embeddings": 64110,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "

In [13]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
# Labels reach `compute_metrics` with -100 as padding, which that function replaces before decoding.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Metrics function

In [14]:
import numpy as np
import nltk
# Download the "punkt" data for `nltk.sent_tokenize`, which `compute_metrics` below calls
nltk.download('punkt')

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (as percentages) and the mean generated length.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays, as handed
            over by the trainer when ``predict_with_generate=True``.

    Returns:
        dict mapping every key returned by the ROUGE metric to its F-measure
        multiplied by 100, plus ``"gen_len"`` (mean number of non-pad tokens
        per prediction); all values rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some model/trainer combinations return a tuple; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding sentinel, not a real token id, so it cannot be decoded.
    # Labels always use it; predictions can also be padded with it when batches
    # of different generated lengths are concatenated by the trainer.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_score.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # The legacy `datasets` metric returns aggregate scores exposing `.mid.fmeasure`;
    # the `evaluate` implementation returns plain floats. Accept both shapes.
    result = {
        key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
        for key, value in result.items()
    }

    # Add mean generated length (number of non-pad tokens per prediction)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asr_l\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training Args

In [15]:
# PT

# Checkpoint name without its Hub namespace; it prefixes the output directory name
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Training configuration: one epoch, evaluation at the end of each epoch, and
# generation-based predictions so that text metrics can be computed.
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned_es_en_translator",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
    fp16=False,
    push_to_hub=False,
)

Training

In [16]:
# PT

# Wire the model, arguments, tokenized splits, collator and metric function together
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [23]:
# Run fine-tuning; with evaluation_strategy="epoch" the validation metrics are reported per epoch
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: __index_level_0__, Original_Sentence, Translate_SP. If __index_level_0__, Original_Sentence, Translate_SP are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2454
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 614

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'loss': 1.2981, 'learning_rate': 3.7133550488599353e-06, 'epoch': 0.81}


Model weights saved in opus-mt-en-mul-finetuned-xsum\checkpoint-500\pytorch_model.bin
tokenizer config file saved in opus-mt-en-mul-finetuned-xsum\checkpoint-500\tokenizer_config.json
Special tokens file saved in opus-mt-en-mul-finetuned-xsum\checkpoint-500\special_tokens_map.json

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[AThe following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: __index_level_0__, Original_Sentence, Translate_SP. If __index_level_0__, Original_Sentence, Translate_SP are not expected by `Maria

{'eval_loss': 1.3484039306640625, 'eval_rouge1': 64.9473, 'eval_rouge2': 42.2411, 'eval_rougeL': 61.9713, 'eval_rougeLsum': 62.9319, 'eval_gen_len': 61.5505, 'eval_runtime': 1156.3348, 'eval_samples_per_second': 0.265, 'eval_steps_per_second': 0.067, 'epoch': 1.0}
{'train_runtime': 2859.8927, 'train_samples_per_second': 0.858, 'train_steps_per_second': 0.215, 'train_loss': 1.3345782252010383, 'epoch': 1.0}





TrainOutput(global_step=614, training_loss=1.3345782252010383, metrics={'train_runtime': 2859.8927, 'train_samples_per_second': 0.858, 'train_steps_per_second': 0.215, 'train_loss': 1.3345782252010383, 'epoch': 1.0})

Saving Model and Testing it from local disk

In [24]:
# Save the fine-tuned model (plus config and tokenizer files, per the log below) to "mod_translate"
trainer.save_model('mod_translate')

Saving model checkpoint to mod_translate
Configuration saved in mod_translate\config.json
Model weights saved in mod_translate\pytorch_model.bin
tokenizer config file saved in mod_translate\tokenizer_config.json
Special tokens file saved in mod_translate\special_tokens_map.json


In [17]:
# Reload the fine-tuned model from the local "mod_translate" directory to test it from disk
model = AutoModelForSeq2SeqLM.from_pretrained('mod_translate')

loading configuration file mod_translate\config.json
Model config MarianConfig {
  "_name_or_path": "mod_translate",
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      64109
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 64109,
  "decoder_vocab_size": 64110,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "extra_pos_embeddings": 64110,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABE

In [18]:
# Reload the tokenizer saved alongside the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained('mod_translate')

Didn't find file mod_translate\target_vocab.json. We won't load it.
Didn't find file mod_translate\added_tokens.json. We won't load it.
loading file mod_translate\source.spm
loading file mod_translate\target.spm
loading file mod_translate\vocab.json
loading file None
loading file mod_translate\tokenizer_config.json
loading file None
loading file mod_translate\special_tokens_map.json


In [19]:
# Encode a single test sentence as PyTorch tensors (a batch of one)
text = tokenizer(['Hi, trying this model out'],return_tensors="pt")

In [20]:
# Generate the translation; despite its name, `output_text` holds token ids (decoded in the next cell)
output_text = model.generate(text['input_ids'])



In [21]:
# Drop the batch dimension and decode the ids to a string, leaving out special tokens
output = tokenizer.decode(output_text.squeeze(), skip_special_tokens=True)

In [22]:
# Show the decoded translation
output

'Hola, probando este modelo'

Evaluate with ROUGE on test dataset

In [19]:
# Held-out test split of the dataset
data_test = data['test']

In [20]:
# (rows, columns) of the test split
data_test.shape

(307, 3)

In [21]:
# Source sentences of the test split (note: `dat_test` holds the sentences, `data_test` the whole split)
dat_test = data_test['Original_Sentence']

In [22]:
# Encode all test sentences as one padded batch. Truncation is enabled, as in the
# training preprocessing, so that an unusually long sentence cannot exceed the
# tokenizer's maximum supported length.
tokens = tokenizer(dat_test, return_tensors="pt", padding=True, truncation=True)

In [55]:
# Inspect the padded input ids and the matching attention mask
tokens

{'input_ids': tensor([[36605, 32469,  1025,  ..., 64109, 64109, 64109],
        [  252,    67,    44,  ..., 64109, 64109, 64109],
        [42664,    25, 32208,  ..., 64109, 64109, 64109],
        ...,
        [  161,    72, 19193,  ..., 64109, 64109, 64109],
        [ 1486,     6,    47,  ..., 64109, 64109, 64109],
        [41413, 12729, 38660,  ..., 64109, 64109, 64109]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [23]:
# Generate translations for the test set in small batches. A single call over the
# whole padded test set was interrupted in the recorded run. The attention mask is
# passed so that padding positions are not treated as real input tokens.
# `predictions` ends up as a list of token-id lists, one per test sentence.
GENERATION_BATCH_SIZE = 8
predictions = []
for start in range(0, tokens['input_ids'].shape[0], GENERATION_BATCH_SIZE):
    stop = start + GENERATION_BATCH_SIZE
    batch_ids = model.generate(
        input_ids=tokens['input_ids'][start:stop],
        attention_mask=tokens['attention_mask'][start:stop],
    )
    predictions.extend(batch_ids.tolist())



KeyboardInterrupt: 

In [None]:
# Decode every generated sequence (batch_decode handles a batch; special/pad tokens are
# dropped) and score the texts against the reference translations of the test split.
# The metric's `compute` takes `predictions` and `references` as keyword arguments.
decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
rouge_score.compute(predictions=decoded_predictions, references=data['test']['Translate_SP'])

Pushing model to hub

In [20]:
# Checkpoint name without its Hub namespace; used below to name the Hub repository
model_name = model_checkpoint.split("/")[-1]


In [42]:
# Model upload is left disabled in this cell; uncomment the line below to push the model weights too.
#model.push_to_hub(f"{model_name}-finetuned_en_sp_translator")

In [22]:
# Upload the tokenizer files to the Hub repository named "<model_name>-finetuned_en_sp_translator"
tokenizer.push_to_hub(f"{model_name}-finetuned_en_sp_translator")

Adding files tracked by Git LFS: ['source.spm', 'target.spm']. This may take a bit of time if the files are large.


Upload file source.spm:   4%|4         | 32.0k/772k [00:00<?, ?B/s]

Upload file target.spm:   5%|4         | 32.0k/690k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/razwand/opus-mt-en-mul-finetuned_en_sp_translator
   24eab9a..dba96c0  main -> main



'https://huggingface.co/razwand/opus-mt-en-mul-finetuned_en_sp_translator/commit/dba96c0959ef0f0591166b34eccd225b25cf1ccf'