# Import libraries

In [2]:
import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer
import evaluate
import numpy as np

from transformers import DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import matplotlib.pyplot as plt
from datasets import load_dataset
import datasets

import nltk
nltk.download('punkt')

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Prepare Data

In [11]:
data = load_dataset("opus_wikipedia", "en-ru")

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.87k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/57.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/572717 [00:00<?, ? examples/s]

# Download model and tokenizer

In [14]:
checkpoint = "Mprimus/T5-translation"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [15]:
prefix = "translate en-ru: "

def preprocess_function(examples):
    inputs = [prefix + doc["en"] for doc in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)
    
    lebel_inputs = [doc["ru"] for doc in examples["translation"]]
    labels = tokenizer(lebel_inputs, max_length=128, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
train_data = data['train'].train_test_split(test_size=0.2)

In [18]:
tokenized_train_data = train_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/458173 [00:00<?, ? examples/s]

Map:   0%|          | 0/114544 [00:00<?, ? examples/s]

In [19]:
tokenized_train_data

DatasetDict({
    train: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 458173
    })
    test: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 114544
    })
})

In [20]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    result = {key: value * 100 for key, value in result.items()}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [21]:
# Login to push result in huggingface
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Train model

In [22]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir="T5-translation",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    push_to_hub=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data["train"],
    eval_dataset=tokenized_train_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.4085,0.333595,46.4359,28.75,45.9462,45.9477,16.9831
2,0.3982,0.331446,46.5223,28.8483,46.0311,46.0347,16.9977




TrainOutput(global_step=14318, training_loss=0.40573415744236385, metrics={'train_runtime': 14665.6355, 'train_samples_per_second': 62.483, 'train_steps_per_second': 0.976, 'total_flos': 1.5571851768889344e+17, 'train_loss': 0.40573415744236385, 'epoch': 2.0})

In [25]:
# Push model to huggingface
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/977M [00:00<?, ?B/s]

'https://huggingface.co/Mprimus/T5-translation/tree/main/'

# Make predictions

In [39]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_checkpoint = "Mprimus/T5-translation"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

prefix = "translate en-ru: "

def translate(text, max_length=128):
    text = prefix + text
    model_inputs = tokenizer.encode(text, return_tensors="pt")
    outputs = model.generate(model_inputs, num_beams=2, max_new_tokens=max_length)
    
    res = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    return res[0]

In [40]:
translate("Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.")

'Звезда Гарри Поттера Дэниел Рэдклифф получает доступ к £20 млн ($41,1 млн) богатству, когда он становится 18 лет.'