# Install Library

In [1]:
!pip install transformers datasets numpy evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m858.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu, evaluate
Successfully installed evaluate-0.4.2 portalocker-2.10.1 sacrebleu-2.4.3


# Loading Library

In [2]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments,AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import evaluate
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Loading Data

### https://huggingface.co/datasets/Helsinki-NLP/opus_books
### https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-en-fr

In [3]:
from datasets import load_dataset

ds = load_dataset("Helsinki-NLP/opus_books", "en-fr")
ds = ds.remove_columns('id')
ds

Downloading readme:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 127085
    })
})

In [4]:
ds = ds['train'].train_test_split(train_size=0.8)
ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 101668
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 25417
    })
})

In [5]:
ds['train'][0]

{'translation': {'en': '"I could not find them; they did not come."',
  'fr': '-- Je ne les avais pas trouvés, ils ne sont pas venus.'}}

# Loading Model & Tokenizer

In [6]:
model_checkpionts = 'Helsinki-NLP/opus-mt-tc-big-en-fr'

# tokenizer = AutoTokenizer.from_pretrained(model_checkpionts)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpionts)

tokenizer = MarianTokenizer.from_pretrained(model_checkpionts)
model = MarianMTModel.from_pretrained(model_checkpionts)

tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/461M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

# Preprocessing In Text

In [7]:
ds['test']['translation'][:3]

[{'en': 'Voici le traité que je vous propose : si vous daignez y consentir, je ne serai pas votre maîtresse pour un instant fugitif, et en vertu d’un serment extorqué par la peur, mais je consacrerai tous les instants de ma vie à faire votre félicité, je serai toujours ce que j’ai été depuis quatre mois, et peut-être l’amour viendra-t-il couronner l’amitié.',
  'fr': 'This is the compact that I propose; if you deign to consent to it, I shall not be your mistress for a fleeting instant, and by virtue of an oath extorted by fear, but I shall consecrate every moment of my life to procuring your happiness, I shall be always what I have been for the last four months, and perhaps love will come to crown friendship.'},
 {'en': 'This boat was iron-plated.',
  'fr': 'Ce canot était fait en tôle boulonnée.'},
 {'en': 'Their object in lighting a fire was only to enable them to withstand the cold temperature of the night, as it was not employed in cooking the bird, which Neb kept for the next day.

In [8]:
source_lan = 'en'
target_lan = 'fr'
prefix = 'translate English to Frensh: '

def preprocessing(data):
    inputs = [example[source_lan] for example in data['translation']]
    target = [example[target_lan] for example in data['translation']]
    model_inputs = tokenizer(inputs, text_target=target, return_tensors="pt", padding=True,
                             truncation=True)
    return model_inputs

In [9]:
ds = ds.map(preprocessing, batched=True)
# ds['test'].map(preprocessing, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [10]:
ds

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 101668
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 25417
    })
})

# Loading CollectionSeq2Seq

In [11]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_checkpionts)

# Create Metrics Function

In [12]:
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [13]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Create Model Arguments

In [18]:
args = Seq2SeqTrainingArguments(
    output_dir="./Helsinki-mt-en-fr",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=1,
    fp16=True,
    warmup_steps=2000,
    logging_steps=2000, save_steps=4000

)

trainer = Seq2SeqTrainer(
model=model,
args=args,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
train_dataset=ds['train'],
eval_dataset=ds['test'])

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [19]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
2000,0.9413
4000,0.2077
6000,0.1993


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53016]], 'forced_eos_token_id': 43311}
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53016]], 'forced_eos_token_id': 43311}


TrainOutput(global_step=6355, training_loss=0.4352935178743208, metrics={'train_runtime': 10814.6754, 'train_samples_per_second': 9.401, 'train_steps_per_second': 0.588, 'total_flos': 4.439994884318822e+16, 'train_loss': 0.4352935178743208, 'epoch': 1.0})

# Model Evaluation