#### This notebook illustrates transfer-learning of a T5 model using IWSLT with English and French

In [None]:
import warnings
warnings.filterwarnings("ignore")

# HuggingFace login

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load data


`datasets.ReadInstruction` gives `load_dataset` an instruction describing which part of the data should be loaded:
`'train'` selects the train split, and `to=50` together with `unit='%'` means "take the first $50\%$ of it".

`load_dataset` loads the dataset: `language_pair=("en", "fr")` selects the English-to-French translation data, and `year="2016"` selects the 2016 version of the data.

We use this to get the training dataset.

In [None]:
import torch
import datasets
from datasets import load_dataset

## Train data

In [None]:
# Read instruction selecting the first 50% of the 'train' split.
ri = datasets.ReadInstruction('train', to=50, unit='%')
train_0_50pct_ds = load_dataset(
    "IWSLT/ted_talks_iwslt",
    language_pair=("en", "fr"),
    year="2016",
    split=ri,
)
print(train_0_50pct_ds)

Dataset({
    features: ['translation'],
    num_rows: 2035
})


## Test Data

In [None]:
# Read instruction selecting the 'train' split from the 95% mark onwards.
ri = datasets.ReadInstruction('train', from_=95, unit='%')
test_95_100pct_ds = load_dataset(
    "IWSLT/ted_talks_iwslt",
    language_pair=("en", "fr"),
    year="2016",
    split=ri,
)

In [None]:
test_95_100pct_ds

Dataset({
    features: ['translation'],
    num_rows: 204
})

In [None]:
test_95_100pct_ds[0]['translation']

{'en': 'What happens when a black man visits Aspen? Singer/songwriter Stew and his band are about to let you know.',
 'fr': 'Que se passe-t-il quand un noir visite Aspen? L\'auteur-compositeur-interprète Stew est sur le point de vous le faire découvrir, avec l\'humour acide et intelligent qui caractérise son travail. (Indice : "c\'est un pays des merveilles d\'hiver dans l\'antre de la bête"). Stew est accompagné sur scène par Heidi Rodewald à la basse et Jon Spurney au clavier et à la guitare.'}

## Validation data

In [None]:
# Validation part 1: the 'train' split of the 2014 data from the 95% mark onwards.
ri = datasets.ReadInstruction('train', from_=95, unit='%')
val2014_95_100pct_ds = load_dataset(
    "IWSLT/ted_talks_iwslt",
    language_pair=("en", "fr"),
    year="2014",
    split=ri,
)

In [None]:
# Validation part 2: the 'train' split of the 2015 data from the 95% mark onwards.
ri = datasets.ReadInstruction('train', from_=95, unit='%')
val2015_95_100pct_ds = load_dataset(
    "IWSLT/ted_talks_iwslt",
    language_pair=("en", "fr"),
    year="2015",
    split=ri,
)

In [None]:
# Merge the 2014 and 2015 slices into a single validation dataset.
validation_parts = [val2014_95_100pct_ds, val2015_95_100pct_ds]
validation_dataset = datasets.concatenate_datasets(validation_parts)

## Model and Tokenizer

#### get the model and tokenizer

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Single identifier for the pretrained checkpoint; both the model weights and
# the tokenizer are loaded from it so the two cannot drift apart.
checkpoint = "google-t5/t5-small"
# Short label kept for any code that still refers to `modelname`.
modelname = 't5-small'
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


#### look at the structure of datapoints

In [None]:
from tqdm import tqdm

# Tokenize the first 20 test examples one at a time, printing the raw English
# text followed by the tokenizer output for the (English, French) pair.
for i, example in tqdm(zip(range(20), test_95_100pct_ds), total=20):
    pair = example["translation"]
    inputs = pair['en']
    print(f"{i} {inputs}")
    targets = pair['fr']
    model_inputs0 = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,
        truncation=True,
    )
    print(f"{i} {model_inputs0}")

100%|█████████████████████████████████████████| 20/20 [00:00<00:00, 3551.94it/s]

0 What happens when a black man visits Aspen? Singer/songwriter Stew and his band are about to let you know.
0 {'input_ids': [363, 2906, 116, 3, 9, 1001, 388, 8305, 282, 3208, 58, 24366, 87, 21101, 3557, 210, 11, 112, 1928, 33, 81, 12, 752, 25, 214, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [7227, 142, 8063, 18, 17, 18, 173, 4679, 73, 9691, 13497, 282, 3208, 58, 301, 31, 9474, 18, 287, 19882, 5348, 18, 3870, 102, 52, 9831, 3557, 210, 259, 244, 90, 500, 20, 327, 90, 1143, 7698, 6, 393, 3, 40, 31, 4884, 1211, 3562, 15, 3, 15, 17, 7951, 285, 26561, 15, 520, 2954, 5, 41, 1570, 26, 867, 3, 10, 96, 75, 31, 222, 73, 3277, 93, 27015, 7, 3, 26, 31, 15123, 247, 3, 40, 31, 288, 60, 20, 50, 3, 115, 7425, 15, 121, 137, 3557, 210, 259, 20342, 244, 12739, 260, 30640, 8222, 15, 11611, 3, 85, 50, 19379, 3, 15, 17, 8178, 17740, 3186, 185, 3, 18780, 972, 3, 15, 17, 3, 85, 50, 5507, 15, 5, 1]}
1 Stew: "Black Men Ski"
1 {'input_ids




#### Side-bar: in the context of natural language processing, "glue", "mrpc" refers to a specific dataset in the "General Language Understanding Evaluation" (GLUE) benchmark, where "mrpc" stands for "Microsoft Research Paraphrase Corpus" - a collection of sentence pairs manually labeled as either paraphrases or not, used to train models on the task of identifying semantically equivalent sentences. We will demonstrate tokenization using that dataset. Compare the structure of raw_datasets with the structure of test_95_100pct_ds, and create a tokenize_function for test_95_100pct_ds with this comparison in mind.

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint1 = "bert-base-uncased"
tokenizer1 = AutoTokenizer.from_pretrained(checkpoint1)


def tokenize_function(example):
    """Tokenize MRPC sentence pairs with the BERT tokenizer ``tokenizer1``.

    Args:
        example: a row (or, with ``batched=True``, a batch of rows) exposing
            the ``"sentence1"`` and ``"sentence2"`` columns.

    Returns:
        The tokenizer output for the sentence pair(s), truncated to the
        tokenizer's maximum length.
    """
    return tokenizer1(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Pad with the same tokenizer that produced the token ids (the BERT
# `tokenizer1`), not the T5 `tokenizer` defined earlier in the notebook.
data_collator1 = DataCollatorWithPadding(tokenizer=tokenizer1)

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

#### End of sidebar

#### Why doesn't the next example work the same way? Modify it so that it works!

In [None]:
raw_datasets['train'][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [None]:
test_95_100pct_ds[0]

{'translation': {'en': 'What happens when a black man visits Aspen? Singer/songwriter Stew and his band are about to let you know.',
  'fr': 'Que se passe-t-il quand un noir visite Aspen? L\'auteur-compositeur-interprète Stew est sur le point de vous le faire découvrir, avec l\'humour acide et intelligent qui caractérise son travail. (Indice : "c\'est un pays des merveilles d\'hiver dans l\'antre de la bête"). Stew est accompagné sur scène par Heidi Rodewald à la basse et Jon Spurney au clavier et à la guitare.'}}

In [None]:
def tokenize_function(example):
    # NOTE: this cell fails as written. When called through
    # `map(..., batched=True)`, `example["translation"]` is a list, so
    # indexing it with 'en' / 'fr' raises
    # "TypeError: list indices must be integers or slices, not str".
    # This definition also shadows the MRPC `tokenize_function` defined in the
    # sidebar above; `preprocess_function` further down is the working version.
    return tokenizer(example["translation"]['en'], example["translation"]['fr'], truncation=True)


tokenized_dataset = test_95_100pct_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

TypeError: list indices must be integers or slices, not str

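**Note**: The function raises an error because of how `map` builds batches when `batched=True`: `example["translation"]` is then a list of `{'en': ..., 'fr': ...}` dictionaries (one per row) rather than a single dictionary, so indexing it with `'en'` fails. The tokenizer needs a list of English strings and a list of French strings instead.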

#### Modify here: rename tokenize_function into preprocess_function

In [None]:
source_lang = "en"
target_lang = "fr"


def preprocess_function(examples, max_length=128, prefix=""):
    """Tokenize a batch of translation pairs for seq2seq training.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["translation"]``
    is then a list of dicts keyed by language code, from which one list of
    source sentences and one list of target sentences are built.

    Args:
        examples: batch with a ``"translation"`` column holding dicts that
            contain the ``source_lang`` and ``target_lang`` keys.
        max_length: maximum number of tokens kept for inputs and targets;
            longer sequences are truncated. Defaults to 128.
        prefix: optional string prepended to every source sentence, e.g. a
            T5 task prefix such as ``"translate English to French: "``.
            Defaults to ``""`` (no prefix).

    Returns:
        The tokenizer output; tokenized targets are stored under ``labels``.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)

**Note**: we build one list of English sentences and one list of French sentences, and then pass both lists to the tokenizer.

In [None]:
%%capture --no-display
tokenized_dataset_train = train_0_50pct_ds.map(preprocess_function, batched=True)
tokenized_dataset_test = test_95_100pct_ds.map(preprocess_function, batched=True)
tokenized_dataset_val = validation_dataset.map(preprocess_function, batched=True)

In [None]:
print(tokenized_dataset_test[0])

{'translation': {'en': 'What happens when a black man visits Aspen? Singer/songwriter Stew and his band are about to let you know.', 'fr': 'Que se passe-t-il quand un noir visite Aspen? L\'auteur-compositeur-interprète Stew est sur le point de vous le faire découvrir, avec l\'humour acide et intelligent qui caractérise son travail. (Indice : "c\'est un pays des merveilles d\'hiver dans l\'antre de la bête"). Stew est accompagné sur scène par Heidi Rodewald à la basse et Jon Spurney au clavier et à la guitare.'}, 'input_ids': [363, 2906, 116, 3, 9, 1001, 388, 8305, 282, 3208, 58, 24366, 87, 21101, 3557, 210, 11, 112, 1928, 33, 81, 12, 752, 25, 214, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [7227, 142, 8063, 18, 17, 18, 173, 4679, 73, 9691, 13497, 282, 3208, 58, 301, 31, 9474, 18, 287, 19882, 5348, 18, 3870, 102, 52, 9831, 3557, 210, 259, 244, 90, 500, 20, 327, 90, 1143, 7698, 6, 393, 3, 40, 31, 4884, 1211, 3562,

#### What is input_ids in the above cell?

## load model

In [None]:
from transformers import DataCollatorForSeq2Seq

checkpoint = "google-t5/t5-small"
# Pass the model *instance*, not the checkpoint string: the collator uses
# `model` only to call its `prepare_decoder_input_ids_from_labels` method,
# which a plain string does not have (so the step would be silently skipped).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

#### compute_metrics is one of the arguments in Seq2SeqTrainer
#### look it up and explain what it does; what kind of metric is SacreBLEU?

## eval criteria


# BLEU Score (Bilingual Evaluation Understudy)
sacrebleu computes the BLEU score between a generated text and its reference(s). The BLEU (Bilingual Evaluation Understudy) score is a metric used in natural language processing (NLP) and machine translation to evaluate the quality of generated text against one or more high-quality reference translations. It measures similarity by comparing n-grams (sequences of words) and applies a brevity penalty to translations that are shorter than the reference.

## Formula

$$
BLEU = BP \times \exp \left( \sum_{n=1}^{N} w_n \log p_n \right)
$$

Where:
**BP (Brevity Penalty)**:
  $$
  BP =
  \begin{cases}
    1, & \text{if } c > r \\
    e^{(1 - r/c)}, & \text{if } c \leq r
  \end{cases}
  $$
  $ c $ = candidate translation length, $ r $ = reference length.

$ p_n $ = precision for n-grams.
$ w_n $ = weight for n-gram precision (usually uniform, e.g., $ \frac{1}{4} $ for BLEU-4).
$ N $ = max n-gram order.


## Example Calculation

**Reference Sentence:**  
*"The cat is on the mat."*  

**Candidate Translation:**  
*"The cat is mat."*  

### Step 1: N-gram Precision

| N-gram | Candidate | Matches | Precision |
|--------|----------|---------|-----------|
| **Unigram** | {The, cat, is, mat} | {The, cat, is, mat} | $ 4/4 = 1.0 $ |
| **Bigram** | {The cat, cat is, is mat} | {The cat, cat is} | $ 2/3 = 0.67 $ |

### Step 2: Compute Geometric Mean (here $N = 2$, so $w_1 = w_2 = \frac{1}{2}$)
$$
\exp \left( \frac{1}{2} (\log 1.0 + \log 0.67) \right) = \exp(-0.20) \approx 0.82
$$

### Step 3: Brevity Penalty
$$
BP = e^{(1 - 6/4)} = e^{-0.5} \approx 0.61
$$

### Step 4: Final BLEU Score
$$
BLEU = 0.61 \times 0.82 = 0.50
$$



In [None]:
import numpy as np

import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A tuple ``(preds, labels)``: ``preds`` is a list of stripped strings,
        and ``labels`` is a list of single-element lists, each holding one
        stripped reference string.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute the BLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (the ``"score"`` field of the sacrebleu result)
        and ``"gen_len"`` (mean count of non-pad tokens per prediction), both
        rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore/padding marker the tokenizer cannot decode. Replace it
    # with the pad token id in BOTH arrays; leaving it in the predictions can
    # break batch_decode and would also count those slots in gen_len.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

# Training

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

#### Instantiate Seq2SeqTrainingArguments and Seq2SeqTrainer here

In [None]:
import os

# Create the checkpoint and log directories if they are missing.
# exist_ok=True keeps the cell safe to re-run and removes the separate
# check-then-create step.
for directory in ('./results', './logs'):
    os.makedirs(directory, exist_ok=True)


In [None]:

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",         # Directory for model checkpoints
    # NOTE(review): newer transformers releases name this `eval_strategy` -
    # confirm against the installed version.
    evaluation_strategy="epoch",    # Evaluate at the end of every epoch
    learning_rate=5e-5,             # Learning rate
    per_device_train_batch_size=8,  # Training batch size per device
    per_device_eval_batch_size=8,   # Evaluation batch size per device
    weight_decay=0.01,              # Weight decay
    save_total_limit=3,             # Limit on saved checkpoints
    num_train_epochs=2,             # Number of training epochs
    predict_with_generate=True,     # Use `generate()` during evaluation/prediction
    logging_dir="./logs",           # Directory for logs
    # Log once per epoch so the training-loss column is populated even on
    # short runs; a large step interval can exceed the total number of steps,
    # in which case the training loss is reported as "No log".
    logging_strategy="epoch",
    save_strategy="epoch",          # Save a checkpoint at the end of every epoch
    # `eval_steps` is intentionally not set: evaluation is epoch-based above.
    report_to="none",               # Reporting (e.g., "wandb", "tensorboard", or "none")
)

In [None]:
# Wire the model, the tokenized datasets, the collator and the metric function
# into a single Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,            # builds the batches
    train_dataset=tokenized_dataset_train,  # used by trainer.train()
    eval_dataset=tokenized_dataset_val,     # used by evaluation
    compute_metrics=compute_metrics,        # evaluation metric function
)

#### make sure that you first test the trainer with num_train_epochs=2

In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.321376,6.2094,17.7485
2,No log,1.308383,6.4017,17.7964


TrainOutput(global_step=256, training_loss=1.628257155418396, metrics={'train_runtime': 34.2992, 'train_samples_per_second': 118.662, 'train_steps_per_second': 7.464, 'total_flos': 122870001598464.0, 'train_loss': 1.628257155418396, 'epoch': 2.0})

#### call trainer's methods evaluate and predict and print the results of predict


In [None]:
# Evaluate on the dataset the trainer was given as `eval_dataset`.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Run prediction on the tokenized test dataset (not the evaluation dataset).
predictions = trainer.predict(tokenized_dataset_test)
print("Prediction Results:", predictions.predictions)

# Decode the predicted token ids and show the first five sentences.
# As in compute_metrics: unwrap a tuple and replace the undecodable -100
# marker with the pad token id before decoding.
predicted_ids = predictions.predictions
if isinstance(predicted_ids, tuple):
    predicted_ids = predicted_ids[0]
predicted_ids = np.where(predicted_ids != -100, predicted_ids, tokenizer.pad_token_id)
decoded_predictions = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
print("Sample Predictions:", decoded_predictions[:5])

Evaluation Results: {'eval_loss': 1.3083829879760742, 'eval_bleu': 6.4017, 'eval_gen_len': 17.7964, 'eval_runtime': 5.1824, 'eval_samples_per_second': 64.449, 'eval_steps_per_second': 4.052, 'epoch': 2.0}
Prediction Results: [[    0  7227   142 ...   312     3  5367]
 [    0  3557   210 ...     0     0     0]
 [    0 17129  5545 ...     3 26375    73]
 ...
 [    0  1955   276 ...     1     0     0]
 [    0  3039    73 ...    15    20  2143]
 [    0  9236     9 ...    10     3 15005]]
Sample Predictions: ["Que se passe-t-il lorsqu'un noir visite Aspen? Le chant", 'Stew: « Black Men Ski »', 'Larry Page et Sergey Brin, cofondateurs de Google, offrent un', 'Sergey Brin + Larry Page: La genèse de Google', 'La Violiniste Natalie MacMaster et le Directeur musical TED Thomas Do']
