# Import libraries

In [1]:
import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer
import evaluate
import numpy as np

from transformers import DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import matplotlib.pyplot as plt
from datasets import load_dataset
from datasets import concatenate_datasets
import datasets

import nltk
nltk.download('punkt')

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Prepare Data

In [3]:
data_fr = load_dataset("opus_wikipedia", "en-ru")
data = pd.read_csv("../data/raw/rus.txt", sep="\t", header=None, names=["en", "ru", "attr"])
fifi = []
Len = len(data_fr["train"]["translation"])
List_nd = data_fr["train"]["translation"]
for i in tqdm(range(len(data["en"].tolist())), "Prepare 1st part of data"):
    fifi.append({"en": data["en"][i], "ru": data["ru"][i]})
for i in tqdm(range(Len), "Prepare 2nd part of data"):
    fifi.append(List_nd[i])
    
data = datasets.DatasetDict({
    "train": datasets.Dataset.from_dict({
            "translation": fifi
    })
})
data

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.87k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/57.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/572717 [00:00<?, ? examples/s]

Prepare 1st part of data: 100%|██████████| 399919/399919 [00:02<00:00, 142598.03it/s]
Prepare 2nd part of data: 100%|██████████| 572717/572717 [00:00<00:00, 2002769.02it/s]


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 972636
    })
})

# Download model and tokenizer

In [4]:
checkpoint = "Mprimus/T5-translation"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/828k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/977M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
prefix = "translate en-ru: "

def preprocess_function(examples):
    inputs = [prefix + doc["en"] for doc in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)
    
    lebel_inputs = [doc["ru"] for doc in examples["translation"]]
    labels = tokenizer(lebel_inputs, max_length=128, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
train_data = data['train'].train_test_split(test_size=0.2)

In [7]:
tokenized_train_data = train_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/778108 [00:00<?, ? examples/s]

Map:   0%|          | 0/194528 [00:00<?, ? examples/s]

In [8]:
tokenized_train_data

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 778108
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 194528
    })
})

In [9]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    b_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    result = {key: value * 100 for key, value in result.items()}
    result["bleu"] = b_result["bleu"]
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [2]:
# Login to push result in huggingface
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Train model

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="T5-translation",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=80,
    per_device_eval_batch_size=80,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    push_to_hub=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data["train"],
    eval_dataset=tokenized_train_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Gen Len
1,0.2536,0.207489,27.4428,16.9977,27.161,27.1588,0.2188,14.5482




TrainOutput(global_step=9727, training_loss=0.2556283271752686, metrics={'train_runtime': 11901.1663, 'train_samples_per_second': 65.381, 'train_steps_per_second': 0.817, 'total_flos': 1.3222715476672512e+17, 'train_loss': 0.2556283271752686, 'epoch': 1.0})

In [12]:
# Push model to huggingface
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/977M [00:00<?, ?B/s]

'https://huggingface.co/Mprimus/T5-translation/tree/main/'

# Make predictions

In [15]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_checkpoint = "Mprimus/T5-translation"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

prefix = "translate en-ru: "

def translate(text, max_length=128):
    text = prefix + text
    model_inputs = tokenizer.encode(text, return_tensors="pt")
    outputs = model.generate(model_inputs, num_beams=2, max_new_tokens=max_length)
    
    res = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    return res[0]

In [16]:
translate("Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.")

'Звезда Гарри Поттера Дэниел Рэдклифф получает доступ к известной сумме £20 млн ($41,1 млн), когда он будет 18 лет, но он настаивает, что деньги не окажут на него никакого влияния.'