# KDE4 dataset (English & French)

In [44]:
import torch

In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

Found cached dataset kde4 (/Users/swayam/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [3]:
split_dataset = raw_datasets['train'].train_test_split(train_size=0.9, seed=20) # returns a DatasetDict
split_dataset['validation'] = split_dataset.pop("test")
split_dataset

Loading cached split indices for dataset at /Users/swayam/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac/cache-496be247a58b47c1.arrow and /Users/swayam/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac/cache-2c0faebb61cdd12e.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [4]:
split_dataset["train"][1]["translation"]

{'en': 'Default to expanded threads',
 'fr': 'Par défaut, développer les fils de discussion'}

# Data Preprocessing

In [None]:
!pip install "transformers[sentencepiece]"

In [6]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# This checkpoint ships SentencePiece models (source.spm / target.spm), which is
# why the `transformers[sentencepiece]` extra is installed above.
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

In [7]:
def extract_langauge(batch):
    """Split a batch of translation pairs into parallel source/target lists.

    Args:
        batch: Mapping with a ``'translation'`` entry holding a sequence of
            dicts, each with an ``"en"`` and a ``"fr"`` key.

    Returns:
        dict: ``{'inputs': [...], 'targets': [...]}`` where ``inputs`` holds the
        ``"en"`` strings and ``targets`` the ``"fr"`` strings, in batch order.
    """
    english, french = [], []
    for pair in batch['translation']:
        english.append(pair["en"])
        french.append(pair["fr"])
    return {'inputs': english, 'targets': french}

dataset = split_dataset.map(extract_langauge, batched=True, remove_columns=['id', 'translation'])

Map:   0%|          | 0/189155 [00:00<?, ? examples/s]

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

In [9]:
dataset['train'][10]

{'inputs': 'Text Cursor Movement', 'targets': 'Mouvements du curseur de texte'}

In [11]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [14]:
# Deprecated method (kept for reference; see the `text_target=` version below).
# Tokenizes the already-split source and target columns separately.
def tokenize_text(batch):
    """Tokenize the ``'inputs'`` and ``'targets'`` columns of a batch.

    The source texts are tokenized normally; the target texts are tokenized
    inside ``tokenizer.as_target_tokenizer()``. Both are truncated to 128 tokens.

    Args:
        batch: Mapping with ``'inputs'`` and ``'targets'`` lists of strings.

    Returns:
        The source encoding with the target ``input_ids`` stored under ``'labels'``.
    """
    encoded = tokenizer(batch['inputs'], truncation=True, max_length=128)
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(batch['targets'], truncation=True, max_length=128)['input_ids']

    encoded['labels'] = target_ids
    return encoded

In [16]:
# new method

max_length = 128


def preprocess_function(examples):
    """Tokenize English sources and French targets in one tokenizer call.

    Args:
        examples: Batch mapping whose ``"translation"`` entry is a sequence of
            dicts with ``"en"`` and ``"fr"`` keys.

    Returns:
        The tokenizer output for the English texts, with the French texts
        passed as ``text_target``; both truncated to ``max_length``.
    """
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    references = [pair["fr"] for pair in pairs]
    return tokenizer(
        sources,
        text_target=references,
        truncation=True,
        max_length=max_length,
    )

In [None]:
# tokenized_dataset = dataset.map(tokenize_text, batched=True, remove_columns=['inputs', 'targets'])
# tokenized_dataset

In [17]:
tokenized_dataset = split_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=split_dataset["train"].column_names,
)

tokenized_dataset

Map:   0%|          | 0/189155 [00:00<?, ? examples/s]

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

## Padding
- Pad the input tokens using `<pad>` token
- Pad the target (label) tokens with `-100` so that the padded positions are ignored when the loss is computed (they are swapped back to `<pad>` before decoding for the metric)

`DataCollatorForSeq2Seq` from `transformers` can do this

In [18]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [19]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
batch = data_collator([tokenized_dataset["train"][i] for i in range(1, 3)])
batch["labels"]

tensor([[  577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
           550,  7032,  5821,  7907, 12649,     0]])

In [24]:
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

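`decoder_input_ids` is the `labels` sequence shifted one position to the right (it starts with the decoder start token), with the `-100` padding replaced by the `<pad>` token id.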

In [21]:
batch['input_ids']

tensor([[47591,    12,  9842, 19634,     9,     0, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
         28149,   139, 33712, 25218,     0]])

In [22]:
tokenizer.convert_ids_to_tokens([59513])

['<pad>']

The feature that `Seq2SeqTrainer` adds to its superclass `Trainer` is the ability to use the `generate()` method during evaluation or prediction. During training, the model will use the `decoder_input_ids` with an attention mask ensuring it does not use the tokens after the token it’s trying to predict, to speed up training. During inference we won’t be able to use those since we won’t have labels, so it’s a good idea to evaluate our model with the same setup.

In [None]:
!pip install sacrebleu

In [26]:
import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [28]:
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
# references can be many
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

The score can go from 0 to 100, and higher is better.

To get from the model outputs to texts the metric can use, we will use the `tokenizer.batch_decode()` method. We just have to clean up all the `-100`s in the labels (the tokenizer will automatically do the same for the padding token):

In [32]:
import numpy as np

def compute_metric(eval_preds):
    """Decode predictions and labels and score them with the loaded metric.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` field
        returned by ``metric.compute``.
    """
    preds, labels = eval_preds

    # Predictions can arrive wrapped in a tuple; the token ids are the first item.
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100 is the label padding value and cannot be decoded, so swap it for
    # <pad>, which `skip_special_tokens=True` then drops.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # The metric takes a list of references per prediction, hence the nesting.
    decoded_labels = [[label.strip()] for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [33]:
from transformers import Seq2SeqTrainingArguments

# Training configuration consumed by the Seq2SeqTrainer built in the next cell.
args = Seq2SeqTrainingArguments(
    "marian-finetuned-kde4-en-to-fr",  # plain string: the f-prefix had no placeholders
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,  # evaluate with generate(), matching the inference setup
    push_to_hub=False,
)

In [34]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
)

In [None]:
trainer.evaluate(max_length=max_length)

In [None]:
# trainer.train()

# Custom training loop using Accelerate

In [67]:
from torch.utils.data import DataLoader

tokenized_dataset.set_format("torch")

batch_size = 8

train_loader = DataLoader(tokenized_dataset['train'], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
eval_loader = DataLoader(tokenized_dataset['validation'], batch_size=batch_size, collate_fn=data_collator)

In [68]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [69]:
from accelerate import Accelerator

accelerator = Accelerator()

model, train_loader, eval_loader, optimizer = accelerator.prepare(model, train_loader, eval_loader, optimizer)

In [70]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_loader)
num_training_steps = num_update_steps_per_epoch*num_train_epochs

lr_scheduler = get_scheduler(name="linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)


In [71]:
output_dir = "marian-finetuned-kde4-en-to-fr-accelerate"

In [72]:
def postprocess(predictions, labels):
    """Turn generated token ids and label ids into stripped text.

    Args:
        predictions: Tensor of generated token ids.
        labels: Tensor of label token ids, padded with ``-100``.

    Returns:
        tuple: ``(decoded_preds, decoded_labels)`` where ``decoded_preds`` is a
        list of strings and ``decoded_labels`` is a list of single-item lists
        (one reference per prediction).
    """
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace the -100 label padding with <pad> so the labels can be decoded.
    # The sentinel is -100 (negative); comparing against 100 would leave the
    # padding untouched and corrupt real token id 100 instead.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: trim whitespace, nest references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # ---- training ----
    model.train()
    for batch in train_loader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator instead of loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients after each update; otherwise they accumulate across
        # steps and every update uses the sum of all previous batches' gradients.
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- evaluation ----
    model.eval()
    for batch in eval_loader:
        with torch.inference_mode():
            # generate() is called on the unwrapped model.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=128,
            )

            labels = batch['labels']

            # Necessary to pad predictions and labels for being gathered
            generated_tokens = accelerator.pad_across_processes(generated_tokens, dim=1, pad_index=tokenizer.pad_token_id)
            labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

            predictions_gathered = accelerator.gather(generated_tokens)
            labels_gathered = accelerator.gather(labels)

            decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)

            metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    result = metric.compute()
    print(f"epoch {epoch}, BLEU score: {result['score']:.2f}")




In [74]:
# Synchronize all processes before writing anything to disk.
accelerator.wait_for_everyone()
# Save the underlying model rather than the accelerator-prepared wrapper,
# delegating the actual write to accelerator.save.
unrwapped_model = accelerator.unwrap_model(model)
unrwapped_model.save_pretrained(output_dir, save_function=accelerator.save)

# Only the main process writes the tokenizer files to output_dir.
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)
    # now push the model if you want to
