In [1]:
!pip install transformers datasets evaluate sacrebleu accelerate -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl

In [2]:
from datasets import load_dataset

# Download the "en-fr" configuration of the "opus_books" dataset.
# The recorded run generated a single "train" split of 127085 examples.
engfrDF = load_dataset("opus_books", "en-fr")

Downloading builder script:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/161k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/20.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [3]:
# Hold out 20% of the "train" split for evaluation.
# - The guard makes the cell idempotent: without it, re-running the cell would split
#   the already reduced training subset again and silently shrink the data.
# - The fixed seed makes the train/test partition identical on every run.
if "test" not in engfrDF:
    engfrDF = engfrDF["train"].train_test_split(test_size=0.2, seed=42)

In [4]:
# Inspect the first training record: an "id" plus a "translation" dict keyed by "en"/"fr".
engfrDF["train"][0]

{'id': '21412',
 'translation': {'en': 'For my part, I was never so sensible of danger in my life; for, seeing above three hundred devils come roaring and open-mouthed to devour us, and having nothing to shelter us or retreat to, I gave myself over for lost; and, as it was, I believe I shall never care to cross those mountains again: I think I would much rather go a thousand leagues by sea, though I was sure to meet with a storm once a-week.',
  'fr': "Pour ma part, je n'eus jamais de ma vie un sentiment plus profond du danger; car, lorsque je vis plus de trois cents de ces bêtes infernales, poussant des rugissements et la gueule béante, s'avancer pour nous dévorer, sans que nous eussions rien pour nous réfugier ou nous donner retraite, j'avais cru que c'en était fait de moi. N'importe! je ne pense pas que je me soucie jamais de traverser les montagnes; j'aimerais mieux faire mille lieues en mer, fussé-je sûr d'essuyer une tempête par semaine."}}

In [5]:
# Inspect the first held-out record to confirm it has the same layout as the training data.
engfrDF["test"][0]

{'id': '3607',
 'translation': {'en': '"Miss Bingley told me," said Jane, "that he never speaks much, unless among his intimate acquaintances. With _them_ he is remarkably agreeable."',
  'fr': '– Miss Bingley dit qu’il n’est jamais loquace avec les étrangers, mais que dans l’intimité c’est le plus aimable causeur.'}}

In [6]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused later when loading the model so that
# tokenizer and model come from the same checkpoint.
modelName = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(modelName)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
# Keys of the source and target sentences inside each example's "translation" dict.
engLan = "en"
freLan = "fr"
# Task prefix prepended to every English sentence before tokenization.
prefix = "translate English to French: "


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: a batch whose ``"translation"`` entry is a sequence of dicts
            holding the English text under ``engLan`` and the French text
            under ``freLan``.

    Returns:
        The tokenizer output for the prefixed English sentences, with the
        French sentences supplied as ``text_target``; tokenization uses
        ``max_length=128`` with truncation enabled.
    """
    pairs = examples["translation"]

    # Source side: task prefix + English sentence.
    source_texts = []
    for pair in pairs:
        source_texts.append(prefix + pair[engLan])

    # Target side: the French sentence as-is.
    target_texts = []
    for pair in pairs:
        target_texts.append(pair[freLan])

    return tokenizer(source_texts, text_target=target_texts, max_length=128, truncation=True)

In [8]:
# Tokenize both splits; batched=True passes preprocess_function a batch of examples per call.
encodedDF = engfrDF.map(preprocess_function, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

# Collator that pads each batch of tokenized examples at batch-assembly time.
# NOTE(review): `model` receives the checkpoint name string (modelName), not the model
# object, which is only created in a later cell — confirm this is intended.
dataCollator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=modelName)

In [10]:
import evaluate

# SacreBLEU scorer; compute_metrics calls metric.compute(...) and reads its "score" field.
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [11]:
import numpy as np

# Helper used by compute_metrics to tidy decoded strings before scoring.
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape references for SacreBLEU.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        A tuple ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, one
        reference list per prediction.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    # Each prediction gets its own list of references (here exactly one).
    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute SacreBLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: a pair ``(preds, labels)`` of token-id arrays. ``preds``
            may arrive as a tuple, in which case its first element is used.

    Returns:
        dict with ``"bleu"`` (the SacreBLEU score) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be decoded.
    # Labels always use it for ignored positions; predictions can also carry it in
    # some Trainer configurations, so both arrays are mapped back to the pad token.
    # This also keeps -100 padding from being counted as generated tokens below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generated length = number of non-pad token ids in each prediction row.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence weights for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(modelName)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
# Fine-tuning configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output location; the same name is used later when building the pipeline.
    output_dir="Fnglish_French_Project",
    # Schedule: two passes over the data, evaluating once per epoch.
    num_train_epochs=2,
    evaluation_strategy="epoch",
    # Batching.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation.
    learning_rate=5e-5,
    weight_decay=0.01,
    # Generate sequences during evaluation so compute_metrics receives token ids to decode.
    predict_with_generate=True,
)

In [14]:
# Wire together configuration, model, data and metrics for fine-tuning.
trainer = Seq2SeqTrainer(
    args=training_args,
    # Model and text processing.
    model=model,
    tokenizer=tokenizer,
    data_collator=dataCollator,
    # Tokenized splits: 80% for training, the held-out 20% for evaluation.
    train_dataset=encodedDF["train"],
    eval_dataset=encodedDF["test"],
    # BLEU / generated-length reporting at each evaluation.
    compute_metrics=compute_metrics,
)

In [15]:
# Run fine-tuning. The recorded run took 25418 optimisation steps (~3279 s) and reports
# validation loss, BLEU and generated length after each epoch.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.7368,1.522742,6.1764,17.5982
2,1.6959,1.49407,6.3606,17.5866




TrainOutput(global_step=25418, training_loss=1.7503972135695853, metrics={'train_runtime': 3279.0126, 'train_samples_per_second': 62.011, 'train_steps_per_second': 7.752, 'total_flos': 4187564155600896.0, 'train_loss': 1.7503972135695853, 'epoch': 2.0})

In [16]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget; authenticates this session so the later
# trainer.push_to_hub() call can upload.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
# Upload the fine-tuned model and training artifacts to the Hub; the cell output is the repo URL.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1700938753.633dfdfd0f29.6477.0:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

'https://huggingface.co/sarthakb/Fnglish_French_Project/tree/main/'

In [20]:
from transformers import pipeline

# Build a translation pipeline from "Fnglish_French_Project", the same name used as the
# trainer's output_dir.
# NOTE(review): presumably this resolves to the local output directory — confirm whether
# the Hub copy pushed under the user's namespace was meant instead.
translator = pipeline("translation", model="Fnglish_French_Project")



In [27]:
# Reuse the training-time task prefix instead of re-typing it, so inference inputs are
# formatted exactly like the fine-tuning inputs even if the prefix is later changed.
text = prefix + "This machine knows better french than I learnt in 5 years."
translator(text)

[{'translation_text': "Cette machine connaît mieux le français que j'ai appris en cinq ans."}]

In [24]:
# Reuse the training-time task prefix instead of re-typing it, so inference inputs are
# formatted exactly like the fine-tuning inputs even if the prefix is later changed.
text2 = prefix + "She ordered him to do it later after work"
translator(text2)

[{'translation_text': "Elle l'ordonna de le faire plus tard, après avoir travaillé."}]

In [26]:
# Reuse the training-time task prefix instead of re-typing it, so inference inputs are
# formatted exactly like the fine-tuning inputs even if the prefix is later changed.
text3 = prefix + "Hello! how are you doing?"
translator(text3)

[{'translation_text': 'Bonjour ! comment faites-vous ?'}]