In [1]:
!pip install accelerate -U



In [2]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [3]:
!pip install transformers



In [4]:
!pip install datasets



In [5]:
import pandas as pd
data = pd.read_csv('en-fr_sm.csv')
data.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [6]:
data['nos'] = data['en'].apply(lambda x:len(x.split()))
data

Unnamed: 0,en,fr,nos
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...,37
1,Site map,Plan du site,2
2,Feedback,Rétroaction,1
3,Credits,Crédits,1
4,Français,English,1
...,...,...,...
995,A nova is a star that absorbs matter from a ne...,La matière absorbée finit par réchauffer l'éto...,23
996,It is a rare and spectacular event.,Il s'agit d'un phénomène rare et plutôt specta...,7
997,"In 1977, he became the French editor of the Na...","En 1977, il devient l'éditeur francophone du N...",25
998,"In 1978, Lemay began the daunting task of asse...","En 1978, il entreprend la tâche colossale de r...",20


In [7]:
max(data.nos)

59

In [8]:
from datasets import Dataset
data = Dataset.from_pandas(data)
data

Dataset({
    features: ['en', 'fr', 'nos'],
    num_rows: 1000
})

In [9]:
data = data.train_test_split(test_size=0.2)

In [10]:
data

DatasetDict({
    train: Dataset({
        features: ['en', 'fr', 'nos'],
        num_rows: 800
    })
    test: Dataset({
        features: ['en', 'fr', 'nos'],
        num_rows: 200
    })
})

In [11]:
source = "en"
target = "fr"


def preprocess_function(examples):
    inputs = examples[source]
    targets = examples[target]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=2048, truncation=True)
    return model_inputs

In [12]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM

device = "cuda"

tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")



In [13]:
tokenized_data = data.map(preprocess_function, batched=True)
tokenized_data

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'fr', 'nos', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['en', 'fr', 'nos', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [14]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [15]:
import accelerate

In [16]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.3.0+cu121
True


In [17]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir="English_to_French",
    eval_strategy="epoch",
    learning_rate=2e-03,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)


In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,6.454015
2,6.859700,6.416284


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=800, training_loss=6.66731689453125, metrics={'train_runtime': 141.6627, 'train_samples_per_second': 11.294, 'train_steps_per_second': 5.647, 'total_flos': 33566468382720.0, 'train_loss': 6.66731689453125, 'epoch': 2.0})

In [24]:
from transformers import pipeline

text = "Hi, how are you?"
translator = pipeline("translation_en_to_fr", model="/content/English_to_French/checkpoint-500", max_length=2048)
translator(text)

[{'translation_text': "''',,',''é'' de''', de',' de,,, de,'é,','"}]