In [20]:
from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [21]:
inputs = tokenizer("I loved reading the Hunger Games!")
inputs

{'input_ids': [336, 259, 28387, 11807, 287, 62893, 295, 12507, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁I', '▁', 'loved', '▁reading', '▁the', '▁Hung', 'er', '▁Games', '!', '</s>']

In [23]:
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["title"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [24]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Load the dataset
news_train = load_dataset('csv', data_files='../data/train.csv')
news_test = load_dataset('csv', data_files='../data/test.csv')
news_validate = load_dataset('csv', data_files='../data/validate.csv')

# Split the dataset into train and test sets


Generating train split: 6000 examples [00:00, 28738.02 examples/s]
Generating train split: 2000 examples [00:00, 25864.26 examples/s]
Generating train split: 2000 examples [00:00, 29277.26 examples/s]


In [25]:
def select_columns(example):
    return {'title': example['title'], 'text': example['text']}

selected = news_train.map(select_columns, remove_columns=['Unnamed: 0', 'url', 'topic', 'tags', 'date'])
selected_test = news_test.map(select_columns, remove_columns=['Unnamed: 0', 'url', 'topic', 'tags', 'date'])
selected_validate = news_validate.map(select_columns, remove_columns=['Unnamed: 0', 'url', 'topic', 'tags', 'date'])

Map: 100%|██████████| 6000/6000 [00:00<00:00, 14059.20 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 12637.23 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 14996.15 examples/s]


In [52]:
tokenized_train = selected.map(preprocess_function, batched=True)
tokenized_test = selected_test.map(preprocess_function, batched=True)
tokenized_validate = selected_validate.map(preprocess_function, batched=True)

In [50]:
tokenized_validate['train']

Dataset({
    features: ['title', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})

In [53]:
from datasets import DatasetDict

tokenized = DatasetDict()


tokenized['train'] = tokenized_train['train']
tokenized['test'] = tokenized_test['train']
tokenized['validate'] = tokenized_validate['train']

In [54]:
tokenized


DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['title', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
    validate: Dataset({
        features: ['title', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

Метрика
ROUGE

Recall = Number of over lapping words
 / Total number of words in reference summary

​


Precision = 
Number of over lapping words /
Total number of words in generated summary
​
 



In [28]:
import evaluate

rouge_score = evaluate.load("rouge")

In [29]:
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

In [30]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923,
 'rougeLsum': 0.923076923076923}

In [55]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])


print(three_sentence_summary(tokenized['train']['text'][0]))

Американский бомбардировщик-невидимка F-117 "Nighthawk" вызвал неподдельный интерес посетителей авиасалона ILA-2000, открывшегося во вторник в Берлинском аэропорту Schoenefeld.
Русские тоже представили на салоне военные МиГ-29, совершив на них беспосадочный перелет со своих аэродромов, отмечает РИА "Новости".
Среди участников берлинского авиасалона 940 фирм из 38 стран мира, всего на нем представлено более трехсот новейших летательных аппаратов.


In [56]:
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["text"]]
    return metric.compute(predictions=summaries, references=dataset["text"])

In [57]:
import pandas as pd

score = evaluate_baseline(tokenized['train'], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(score[rn] * 100, 2)) for rn in rouge_names)
rouge_dict

{'rouge1': 50.63, 'rouge2': 33.94, 'rougeL': 50.61, 'rougeLsum': 49.65}

Дообучение mT5 с API Trainer

In [15]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [58]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
# Выводим потери при обучении по каждой эпохе
logging_steps = len(tokenized['train']) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-amazon-en-es",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [59]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Декодируем сгенерированные резюме в текст
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Заменяем -100 в метках, поскольку мы не можем их декодировать
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Декодируем эталонные резюме в текст
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE ожидает символ новой строки после каждого предложения
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Вычисляем оценки ROUGE
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Извлекаем медианные оценки
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [60]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [61]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validate"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-660c5f14-10a6d50c2a86996b6353918b;c71807c9-6322-4e3a-8ebd-34fd771c6da4)

Invalid username or password.