In [None]:
!pip install datasets

In [None]:
!pip install -U accelerate

In [None]:
!pip install -U transformers

In [None]:
from datasets import load_dataset
dataset = load_dataset("stas/wmt16-en-ro-pre-processed", cache_dir="./wmt16-en_ro")

In [None]:
dataset

In [4]:
def flatten(batch):
    batch['en'] = batch['translation']['en']
    batch['ro'] = batch['translation']['ro']

    return batch

In [None]:
train = dataset['train'].map( flatten )
test = dataset['test'].map( flatten )
validation = dataset['validation'].map( flatten )
train.save_to_disk("./dataset/train")
test.save_to_disk("./dataset/test")
validation.save_to_disk("./dataset/validation")

In [23]:
train.shape

(610320, 3)

In [11]:
test.shape

(1999, 3)

In [12]:
validation.shape

(1999, 3)

In [None]:
print(train['ro'][0])

In [None]:
print(train['en'][0])

In [10]:
from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.normalizer = normalizers.Lowercase()
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.BpeTrainer(
    vocab_size=50265,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

In [11]:
def batch_iterator():
    batch_length = 1000
    for i in range(0, len(train), batch_length):
        yield train[i : i + batch_length]["ro"]

bpe_tokenizer.train_from_iterator( batch_iterator(), length=len(train), trainer=trainer )
bpe_tokenizer.save("./ro_tokenizer.json")

In [12]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

en_tokenizer = AutoTokenizer.from_pretrained( "facebook/bart-base" );
ro_tokenizer = PreTrainedTokenizerFast.from_pretrained( "./ro_tokenizer.json" )
ro_tokenizer.pad_token = en_tokenizer.pad_token

def tokenize_dataset(sample):
    input = en_tokenizer(sample['en'], padding='max_length', max_length=120, truncation=True)
    label = ro_tokenizer(sample['ro'], padding='max_length', max_length=120, truncation=True)

    input["decoder_input_ids"] = label["input_ids"]
    input["decoder_attention_mask"] = label["attention_mask"]
    input["labels"] = label["input_ids"]

    return input

train_tokenized = train.map(tokenize_dataset, batched=True)
test_tokenized = test.map(tokenize_dataset, batched=True)
validation_tokenized = validation.map(tokenize_dataset, batched=True)



Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

In [13]:
from transformers import BartForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

model = BartForConditionalGeneration.from_pretrained(  "facebook/bart-base" )

training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    logging_steps=2,
    save_steps=64,
    eval_steps=64,
    warmup_steps=1,
    max_steps=128,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=False,
)



In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
)

trainer.train()

Step,Training Loss,Validation Loss
64,3.2134,2.332676
128,1.9038,0.905992


TrainOutput(global_step=128, training_loss=4.970657631754875, metrics={'train_runtime': 4216.4029, 'train_samples_per_second': 0.061, 'train_steps_per_second': 0.03, 'total_flos': 18292093747200.0, 'train_loss': 4.970657631754875, 'epoch': 0.0})

In [15]:
predictions = trainer.predict(validation_tokenized)



In [None]:
predictions

In [None]:
results = trainer.evaluate()