# Creating a text translator by fine-tuning a pretrained model

In [1]:
!pip install datasets
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-te0oxbin
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-te0oxbin
  Resolved https://github.com/huggingface/transformers to commit f40b87de0ca234df61f76928956c4a2118c0b548
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
# NOTE(review): presumably required by the tokenizer of the translation checkpoint loaded later — confirm
!pip install sentencepiece



In [3]:
#@ loading datasets
from datasets import load_dataset

# KDE4 English -> French sentence pairs. `datasets` warns that `trust_remote_code=True` will become
# mandatory for this dataset, so opt in explicitly instead of relying on the implicit default.
raw_datasets = load_dataset('kde4', lang1='en', lang2='fr', trust_remote_code=True)
raw_datasets                                                                    # inspecting the datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [4]:
#@ creating train/test splits
# NOTE(review): the sklearn import below is not used by this cell — the split is performed by the
# `train_test_split` method of the dataset object itself.
from sklearn.model_selection import train_test_split
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)            # 90 percent of the rows go to training; fixed seed for reproducibility
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [5]:
# Rename the held-out "test" split to "validation". Guarded so that re-running this cell
# (after "test" has already been popped) is a no-op instead of raising KeyError.
if "test" in split_datasets:
  split_datasets["validation"] = split_datasets.pop("test")

In [6]:
# One-off scan (kept for reference, disabled) that prints the indices of training examples
# whose English text contains "email":
#   for index, example in enumerate(split_datasets["train"]):
#     if "email" in example["translation"]["en"]:
#       print(index)
# Inspect one such example to see how "email" is rendered in the reference French translation.
split_datasets["train"][356]["translation"]                                               # one English/French pair from the training split

{'en': 'Sends the chart as an email attachment.',
 'fr': 'Envoie le diagramme comme pièce jointe.'}

In [7]:
# Baseline: build a translation pipeline from the pretrained checkpoint (no fine-tuning yet)
# so its output can be compared with the fine-tuned model later on.
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline(task="translation", model=model_checkpoint)


  return self.fget.__get__(instance, owner)()


In [8]:
translator("Default to expanded threads")                                      # translation produced by the pretrained model, before fine-tuning

[{'translation_text': 'Par défaut pour les threads élargis'}]

In [9]:
#@ Preprocessing datasets
# Plan: tokenize the English text as model inputs and the French text as labels.
# Nothing is padded at this stage; sequences are only truncated to `max_length`.
from transformers import AutoTokenizer

max_length = 128                                                 # token cap applied to both inputs and targets
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)      # tokenizer matching the checkpoint selected earlier
def preprocess_function(datasets):
  """Tokenize one batch of translation pairs.

  `datasets` is a batch whose "translation" entry is a sequence of dicts with "en" and "fr" keys.
  The English strings are passed as source text and the French strings as `text_target`;
  both are truncated to the module-level `max_length`. No padding is applied here.

  Returns the object produced by the tokenizer call.
  """
  pairs = datasets["translation"]
  source_texts = [pair["en"] for pair in pairs]       # English side -> model inputs
  target_texts = [pair["fr"] for pair in pairs]       # French side -> labels
  return tokenizer(source_texts, text_target=target_texts, max_length=max_length, truncation=True)

In [10]:
tokenized_datasets = split_datasets.map(preprocess_function,                                         # tokenization function applied to every split
                                        batched=True,                                                # hand examples to the function in batches rather than one at a time
                                        remove_columns=split_datasets["train"].column_names)         # drop the original columns so only the tokenizer output remains

In [11]:
tokenized_datasets                                             # inspecting the tokenized datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

In [12]:
#@ Fine-tuning the pretrained model
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)                      # load the pretrained seq2seq weights that will be fine-tuned


In [13]:
#@ using the data collator from transformers for padding
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)                       # collator used for dynamic (per-batch) padding

In [14]:
#@ Inspecting whether the dynamic padding is working properly
batch = data_collator([tokenized_datasets['train'][i] for i in range(1,3)])      # collate training examples 1 and 2 into a single batch
batch

{'input_ids': tensor([[47591,    12,  9842, 19634,     9,     0, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
         28149,   139, 33712, 25218,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[  577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
           550,  7032,  5821,  7907, 12649,     0]]), 'decoder_input_ids': tensor([[59513,   577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,
         59513, 59513, 59513, 59513, 59513, 59513],
        [59513,  1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,
           817,   550,  7032,  5821,  7907, 12649]])}

In [15]:
batch["decoder_input_ids"]                                                  # decoder inputs the collator added to the batch

tensor([[59513,   577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,
         59513, 59513, 59513, 59513, 59513, 59513],
        [59513,  1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,
           817,   550,  7032,  5821,  7907, 12649]])

In [16]:
# NOTE(review): presumably the backend for the "sacrebleu" metric loaded through `evaluate` — confirm
!pip install sacrebleu



In [17]:
# `evaluate` is imported in the next cell to load the metric
!pip install evaluate



In [18]:
import evaluate
metric = evaluate.load("sacrebleu")                                      # BLEU compares generated translations to reference translations

In [19]:
#@ designing custom evaluation function
import numpy as np

def compute_metrics(eval_preds):
  """Compute the BLEU score for one evaluation pass.

  Args:
    eval_preds: pair `(preds, labels)` of token-id arrays. `preds` may also be a tuple,
      in which case only its first element is used.

  Returns:
    dict with a single key "bleu" holding the "score" entry of the metric result.
  """
  preds, labels = eval_preds
  # the model may return more than one output; the generated ids come first
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 is a masking value, not a real token id, so it cannot be decoded.
  # Replace it with the pad token in BOTH arrays: the labels always carry it
  # and the predictions can carry it too.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)    # special tokens dropped
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)  # special tokens dropped

  decoded_preds = [pred.strip() for pred in decoded_preds]
  # one list of references per prediction (a single reference each here)
  decoded_labels = [[label.strip()] for label in decoded_labels]

  result = metric.compute(predictions=decoded_preds, references=decoded_labels)
  return {"bleu": result["score"]}

In [20]:
from huggingface_hub import notebook_login

# Shows an interactive login widget. NOTE(review): presumably needed because the training
# arguments further down set push_to_hub=True — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# both packages are imported in the next cell, before the training arguments are built
!pip install torch
!pip install accelerate



In [25]:
import torch
import accelerate
from transformers import Seq2SeqTrainingArguments

# Training configuration. Evaluation is switched off here, so the metric function and the
# validation split handed to the trainer are not used while training runs.
args = Seq2SeqTrainingArguments(                                # preparing training args
    'marian-finetunded-kde4-en-to-fr',                          # output directory; the pushed Hub repo carries this exact name, so the spelling is kept
    evaluation_strategy="no",                                   # no evaluation
    save_strategy="epoch",                                      # save weights at every epoch
    learning_rate=2e-5,                                         # learning rate
    per_device_train_batch_size=32,                             # specifying training batch size
    per_device_eval_batch_size=64,                              # specifying validation batch size
    weight_decay=0.01,                                          # weight decay
    save_total_limit=3,                                         # keep at most this many saved checkpoints
    num_train_epochs=3,                                         # training epochs
    predict_with_generate=True,                                 # predictions are produced with generate()
    fp16=torch.cuda.is_available(),                             # mixed precision speeds up training, but only works on a CUDA GPU
    push_to_hub=True,                                           # pushes model to hugging face hub
)

In [26]:
from transformers import Seq2SeqTrainer

# Wire the model, its training configuration, both data splits, the padding collator,
# the tokenizer and the metric function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [28]:
# run the fine-tuning loop with the trainer configured above
trainer.train()

Step,Training Loss
500,1.4229
1000,1.2337
1500,1.1846
2000,1.1252
2500,1.1131
3000,1.0708
3500,1.0634
4000,1.032
4500,1.0181
5000,1.0084


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


Step,Training Loss
500,1.4229
1000,1.2337
1500,1.1846
2000,1.1252
2500,1.1131
3000,1.0708
3500,1.0634
4000,1.032
4500,1.0181
5000,1.0084


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=17736, training_loss=0.9375644644262773, metrics={'train_runtime': 3257.6277, 'train_samples_per_second': 174.196, 'train_steps_per_second': 5.444, 'total_flos': 1.1305306504691712e+16, 'train_loss': 0.9375644644262773, 'epoch': 3.0})

In [29]:
# upload the trained model to the Hub, tagged as a translation model
trainer.push_to_hub(tags="translation", commit_message="Training complete")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


CommitInfo(commit_url='https://huggingface.co/Utshav/marian-finetunded-kde4-en-to-fr/commit/7cc702a22cce7c5237a1140701595b993efae773', commit_message='Training complete', commit_description='', oid='7cc702a22cce7c5237a1140701595b993efae773', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
#@ Inference with the fine-tuned model
# Note: both `model_checkpoint` and `translator` are rebound here, replacing the
# pretrained baseline objects created earlier in the notebook.
from transformers import pipeline

model_checkpoint = "Utshav/marian-finetunded-kde4-en-to-fr"
translator = pipeline(task="translation", model=model_checkpoint)


In [33]:
# same sentence as the baseline check, now translated by the fine-tuned model
translator("Default to expanded threads")

[{'translation_text': 'Par défaut, développer les fils de discussion'}]