# Библиотеки

In [None]:
import torch
#from tqdm import tqdm
import torchvision
import random
import numpy as np
from IPython.display import clear_output

try:
    import transformers
except ModuleNotFoundError:
    %pip install transformers
    %pip install transformers sentencepiece --quiet
    clear_output()
    import transformers
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    TrainingArguments, Trainer,
)


import locale
def getpreferredencoding(do_setlocale=True):
    """Stand-in for locale.getpreferredencoding that always reports UTF-8.

    `do_setlocale` is accepted only to keep the stdlib signature; it is ignored.
    """
    return "UTF-8"


# Install the stand-in on the locale module so that every later lookup of
# locale.getpreferredencoding gets "UTF-8".
setattr(locale, "getpreferredencoding", getpreferredencoding)



Фиксируем сиды

In [None]:
# Seed every random number generator the notebook uses (torch, random, numpy)
# with the same value.
SEED = 42
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(SEED)
# torch.use_deterministic_algorithms(True)

Пути для сохранения - измените и не забудьте подключить ваш диск:

In [None]:
# Project root; every other location below is derived from it.
path = '/home/jupyter/datasphere/project'

saved_model_path = f'{path}/archive'          # saved model/optimizer checkpoints (.pth)
checkpoint_path = f'{path}/t5-model-small'    # Trainer output directory
logs_path = f'{checkpoint_path}/log'          # loss and metric text logs
dataset_path_1 = f'{path}/dataset'
dataset_path_2 = f'{path}/ruarxiv'

# Checkpoint to resume from, and the hub name of the base model.
pretrained_model = f'{saved_model_path}/model_t5_small_5_7.pth'
raw_model_name = "cointegrated/rut5-small"

Устройство ускорителя:

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Функции:

In [None]:
def save(path, model, optimizer):
    '''
    Persist the model and optimizer states to a single file.

    Args:
        path: destination handed to torch.save.
        model: object whose state_dict() is stored under 'model_state_dict'.
        optimizer: object whose state_dict() is stored under 'optimizer_state_dict'.
    '''
    checkpoint = dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(checkpoint, path)

def summarize(text, model, tokenizer, max_length_text=2890, max_length_ref=500):
    '''
    Generate a summary of `text` with `model`.

    The text is tokenized with special tokens, truncated and padded to
    `max_length_text` tokens, and moved to the notebook-level `device`.
    `max_length_ref` is passed to model.generate as `max_length`.

    Returns the first generated sequence decoded to str, with special tokens skipped.
    '''
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length_text,
        padding="max_length",
        truncation=True,
        return_tensors='pt',
    ).to(device)
    generated = model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_length_ref,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def read_dataset(tokenizer, ex=(), n=100, title=True, dataset_path=dataset_path_1):
    '''
    Build a training dataset from numbered text files in a directory.

    Files `<dataset_path>/<i>.txt` for i in range(n) are read as UTF-8 and split
    on blank lines. With `title` true, part 1 is the source text and part 2 the
    reference summary (part 0 is skipped — presumably the title); otherwise
    parts 0 and 1 are used.

    Args:
        tokenizer: callable tokenizer whose result exposes `.input_ids` and
            supports item assignment.
        ex: indices of files to skip (any iterable of ints; never modified).
        n: number of file indices to scan, starting at 0.
        title: whether the files start with an extra leading part to skip.
        dataset_path: directory holding the `<i>.txt` files.

    Returns:
        list of tokenized source texts, each padded/truncated to 5800 tokens and
        carrying the unpadded summary token ids under "labels".

    Raises:
        FileNotFoundError: a non-excluded `<i>.txt` is missing.
        IndexError: a file has fewer blank-line-separated parts than required.
    '''
    skipped = set(ex)  # constant-time membership checks inside the loop
    txt_i = 1 if title else 0
    sum_i = txt_i + 1
    dataset = []
    for i in range(n):
        if i in skipped:
            continue
        with open(f'{dataset_path}/{i}.txt', 'r', encoding='utf-8') as f:
            parts = f.read().split('\n\n')
        # truncation=True makes max_length a real upper bound: with padding alone
        # a text longer than 5800 tokens would come back longer than max_length.
        txt = tokenizer(parts[txt_i], max_length=5800, padding="max_length",
                        truncation=True, add_special_tokens=True)
        txt["labels"] = tokenizer(parts[sum_i], add_special_tokens=True).input_ids
        dataset.append(txt)
    return dataset

def train_model(model, tokenizer, optimizer,
                train_dataset, val_dataset,
                num_steps=1, num_epochs=1, model_num=10, step_=0):
    '''
    Train `model` in `num_steps` rounds, checkpointing and logging after each.

    Every round builds a fresh Trainer around the same model and optimizer,
    trains for `num_epochs` epochs, evaluates once on `val_dataset` when one is
    given, saves the model and optimizer states to
    `saved_model_path/model_t5_small_{model_num}_{step + step_}.pth` and appends
    the train/eval logs to `logs_path/loss_dictionary.txt`.

    Args:
        model: model to train (updated in place).
        tokenizer: tokenizer handed to the Trainer.
        optimizer: optimizer handed to the Trainer; no scheduler is supplied.
        train_dataset: training examples.
        val_dataset: evaluation examples, or None to skip evaluation.
        num_steps: number of train/save rounds.
        num_epochs: epochs per round.
        model_num: model identifier used in the checkpoint file name.
        step_: offset added to the round number in file names and log entries
            (e.g. when continuing an earlier run).
    '''
    import os

    # Create the output locations up front so that a missing directory cannot
    # make a finished training round fail at save time.
    os.makedirs(saved_model_path, exist_ok=True)
    os.makedirs(logs_path, exist_ok=True)

    with_eval = val_dataset is not None

    for step in range(1, num_steps+1):
        # Trainer and its parameters (shared by both the eval and no-eval case)
        args_kwargs = dict(
            output_dir=checkpoint_path,
            overwrite_output_dir=True,
            per_device_train_batch_size=1,
            num_train_epochs=num_epochs,
            warmup_steps=10,
            gradient_accumulation_steps=16,
            save_strategy="epoch",
            seed=42,
        )
        trainer_kwargs = dict(
            model=model,
            train_dataset=train_dataset,
            tokenizer=tokenizer,
            optimizers=(optimizer, None),
        )
        if with_eval:
            # No evaluation strategy is passed: the TrainingArguments default
            # ("no") keeps evaluation out of the training loop; the held-out
            # set is scored once per round via trainer.evaluate() below.
            args_kwargs['per_device_eval_batch_size'] = 1
            trainer_kwargs['eval_dataset'] = val_dataset

        trainer = Trainer(args=TrainingArguments(**args_kwargs), **trainer_kwargs)

        # Train the model
        model.train()
        logs_train = trainer.train()
        logs_eval = None
        if with_eval:
            model.eval()
            logs_eval = trainer.evaluate()

        print('Saving model')
        save(saved_model_path + f'/model_t5_small_{model_num}_{step+step_}.pth', model, optimizer)

        print('Saving loss')
        with open(logs_path + f'/loss_dictionary.txt', 'a', encoding='utf-8') as f:
            f.write(f'Step_{step+step_}\nTRAIN LOG:\n{logs_train}\nEVAL LOG:\n{logs_eval}\n\n')

Токенизатор:

In [None]:
tokenizer = T5Tokenizer.from_pretrained(raw_model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Дообучение

## Загрузка RuArxiv:

In [None]:
train_set = []
pad_id = tokenizer.pad_token_id
#lens1, lens2 = [], []
for i in range(400):
    with open(dataset_path_2 + f'/{i}.txt', 'r', encoding='utf-8') as f:
        data = f.read()
    data = data.split('\n\n')

    # Measure the untruncated lengths first: a document is kept only if both
    # parts fit the limits and the summary is under half the text length.
    len_t = len(tokenizer(data[0], add_special_tokens=True).input_ids)
    len_s = len(tokenizer(data[1], add_special_tokens=True).input_ids)
    if len_t <= 12000 and len_s <= 1000 and len_s / len_t < 0.5:
        txt = tokenizer(data[0], max_length=12000, padding="max_length", truncation=True, add_special_tokens=True)
        sum_ = tokenizer(data[1], max_length=1000, padding="max_length", truncation=True, add_special_tokens=True).input_ids
        # Padding positions in the labels are set to -100 so the loss ignores
        # them; left as pad ids they would be trained on as real targets.
        txt["labels"] = [tok if tok != pad_id else -100 for tok in sum_]
        train_set.append(txt)
        #lens1.append(len_t)
        #lens2.append(len_s)

In [None]:
'''
print(max(lens1))
print(max(lens2))
'''

In [None]:
print(len(train_set))

268


In [None]:
# Hold out everything after the first 241 examples for validation.
# NOTE(review): this cell is not idempotent — train_set is overwritten with its
# own first 241 items, so running the cell a second time leaves val_set empty.
val_set = train_set [241:]
train_set = train_set [:241]

## Gazeta + RuArxiv

In [None]:
pretrained_model = saved_model_path + '/model_t5_small_8_1.pth'

In [None]:
# Start from the base model and a new AdamW optimizer (lr=1e-4), then overwrite
# both with the states stored in the checkpoint file at `pretrained_model`.
# The checkpoint is loaded on the CPU; the model is moved to `device` before the
# optimizer state is restored.
model_t5 = T5ForConditionalGeneration.from_pretrained(raw_model_name)
optimizer = torch.optim.AdamW(model_t5.parameters(),lr=1e-4)
checkpoint = torch.load(pretrained_model, map_location='cpu')
model_t5.load_state_dict(checkpoint['model_state_dict'])
model_t5.to(device)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
num_steps = 9
model_num = 8

In [None]:
train_model(model_t5, tokenizer, optimizer,
            train_set, val_set,
            num_steps=num_steps, model_num = model_num, step_ = 1)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:32<00:00, 34.17s/it]


{'train_runtime': 512.5329, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.029, 'train_loss': 0.8105343500773112, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.25it/s]


Saving model
Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:32<00:00, 34.15s/it]


{'train_runtime': 512.2573, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.029, 'train_loss': 0.776827875773112, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.25it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:33<00:00, 34.23s/it]


{'train_runtime': 513.484, 'train_samples_per_second': 0.469, 'train_steps_per_second': 0.029, 'train_loss': 0.7515774408976237, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.25it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:33<00:00, 34.23s/it]


{'train_runtime': 513.4811, 'train_samples_per_second': 0.469, 'train_steps_per_second': 0.029, 'train_loss': 0.7299936294555665, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.25it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:32<00:00, 34.20s/it]


{'train_runtime': 512.9357, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.029, 'train_loss': 0.7103700637817383, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.24it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:33<00:00, 34.25s/it]


{'train_runtime': 513.8095, 'train_samples_per_second': 0.469, 'train_steps_per_second': 0.029, 'train_loss': 0.6917633056640625, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.25it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:32<00:00, 34.20s/it]


{'train_runtime': 512.9423, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.029, 'train_loss': 0.6738756179809571, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.25it/s]


Saving model
Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:32<00:00, 34.20s/it]


{'train_runtime': 512.9465, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.029, 'train_loss': 0.6561537424723307, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.25it/s]


Saving model
Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 15/15 [08:33<00:00, 34.20s/it]


{'train_runtime': 512.9995, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.029, 'train_loss': 0.6388845443725586, 'epoch': 1.0}


100%|██████████| 27/27 [00:12<00:00,  2.25it/s]


Saving model
Saving loss


## Загрузка RuSciText

In [None]:
train_set_ = []
pad_id = tokenizer.pad_token_id
#lens1, lens2 = [], []
for i in range(100):
    with open(dataset_path_1 + f'/{i}.txt', 'r', encoding='utf-8') as f:
        data = f.read()
    data = data.split('\n\n')
    # Part 1 is the source text and part 2 the reference summary; part 0 is not used.
    # Length measurement, kept for reference:
    #txt = tokenizer(data[1], add_special_tokens=True).input_ids
    #sum_ = tokenizer(data[2], add_special_tokens=True).input_ids
    #len_t, len_s = len(txt), len(sum_)
    txt = tokenizer(data[1], max_length=6000, padding="max_length", truncation=True, add_special_tokens=True)
    sum_ = tokenizer(data[2], max_length=700, padding="max_length", truncation=True, add_special_tokens=True).input_ids
    # Padding positions in the labels are set to -100 so the loss ignores
    # them; left as pad ids they would be trained on as real targets.
    txt["labels"] = [tok if tok != pad_id else -100 for tok in sum_]
    train_set_.append(txt)
    #lens1.append(len_t)
    #lens2.append(len_s)

In [None]:
'''
print(max(lens1))
print(max(lens2))
'''

5791
692


In [None]:
#train_set_ = read_dataset(tokenizer, n=100, title=True, dataset_path = dataset_path_1)
val_set_ = train_set_ [90:]
train_set_ = train_set_ [:90]

In [None]:
# Append the RuSciText splits (train_set_/val_set_) to the current train and
# validation sets, then shuffle each list in place.
# NOTE(review): not idempotent — running this cell again appends
# train_set_/val_set_ a second time.
train_set = train_set + train_set_
val_set = val_set + val_set_
random.shuffle(train_set)
random.shuffle(val_set)

In [None]:
print(len(train_set))

331


## RuSciText + RuArxiv

In [None]:
model_t5 = T5ForConditionalGeneration.from_pretrained(raw_model_name)
optimizer = torch.optim.AdamW(model_t5.parameters(),lr=1e-4)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
pretrained_model = saved_model_path + '/model_t5_small_0_1.pth'

In [None]:
# Start from the base model and a new AdamW optimizer (lr=1e-4), then overwrite
# both with the states stored in the checkpoint file at `pretrained_model`.
# The checkpoint is loaded on the CPU; the model is moved to `device` before the
# optimizer state is restored.
model_t5 = T5ForConditionalGeneration.from_pretrained(raw_model_name)
optimizer = torch.optim.AdamW(model_t5.parameters(),lr=1e-4)
checkpoint = torch.load(pretrained_model, map_location='cpu')
model_t5.load_state_dict(checkpoint['model_state_dict'])
model_t5.to(device)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
num_steps = 7
model_num = 2

In [None]:
train_model(model_t5, tokenizer, optimizer,
            train_set, val_set,
            num_steps=num_steps, model_num = model_num, step_ = 0)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:05<00:00, 27.25s/it]


{'train_runtime': 545.0336, 'train_samples_per_second': 0.607, 'train_steps_per_second': 0.037, 'train_loss': 14.041815185546875, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:06<00:00, 27.31s/it]


{'train_runtime': 546.2618, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 8.813500213623048, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:06<00:00, 27.31s/it]


{'train_runtime': 546.1266, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 5.195806503295898, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:05<00:00, 27.30s/it]


{'train_runtime': 545.9683, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 3.7309722900390625, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:05<00:00, 27.29s/it]


{'train_runtime': 545.8192, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 2.511781120300293, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:06<00:00, 27.33s/it]


{'train_runtime': 546.5568, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 1.6202228546142579, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:05<00:00, 27.30s/it]


{'train_runtime': 545.9917, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 1.3082743644714356, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


In [None]:
train_model(model_t5, tokenizer, optimizer,
            train_set, val_set,
            num_steps=5, model_num = model_num, step_ = 7)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 45%|████▌     | 9/20 [03:50<04:49, 26.29s/it]

## Gazeta + RuSciText

In [None]:
train_set_ = read_dataset(tokenizer, n=100, title=True, dataset_path = dataset_path_1)
val_set_ = train_set_ [90:]
train_set_ = train_set_ [:90]

In [None]:
# Start from the base model and a new AdamW optimizer (lr=1e-4), then overwrite
# both with the states stored in the checkpoint file at `pretrained_model`.
# NOTE(review): `pretrained_model` is not assigned anywhere in this section, so
# it holds whatever an earlier cell last stored in it — confirm that it points
# at the intended checkpoint before running.
model_t5 = T5ForConditionalGeneration.from_pretrained(raw_model_name)
optimizer = torch.optim.AdamW(model_t5.parameters(),lr=1e-4)
checkpoint = torch.load(pretrained_model, map_location='cpu')
model_t5.load_state_dict(checkpoint['model_state_dict'])
model_t5.to(device)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
num_steps = 10
model_num = 7

In [None]:
train_model(model_t5, tokenizer, optimizer,
            train_set_, val_set_,
            num_steps=num_steps, model_num = model_num, step_= 0)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 20%|██        | 1/5 [02:44<10:56, 164.19s/it]

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:10<00:41, 10.44s/it][A
 40%|████      | 2/5 [00:20<00:31, 10.44s/it][A
 60%|██████    | 3/5 [00:31<00:20, 10.40s/it][A
 80%|████████  | 4/5 [00:41<00:10, 10.37s/it][A
                                             [A
 20%|██        | 1/5 [01:49<00:24,  6.22s/it]
100%|██████████| 5/5 [00:56<00:00, 11.31s/it][A


{'train_runtime': 56.5225, 'train_samples_per_second': 1.592, 'train_steps_per_second': 0.088, 'train_loss': 2.8034946441650392, 'epoch': 0.89}



  0%|          | 0/10 [00:00<?, ?it/s][A
 20%|██        | 2/10 [00:00<00:00, 10.69it/s][A
 40%|████      | 4/10 [00:00<00:00,  6.51it/s][A
 50%|█████     | 5/10 [00:00<00:00,  6.01it/s][A
 60%|██████    | 6/10 [00:00<00:00,  5.72it/s][A
 70%|███████   | 7/10 [00:01<00:00,  5.54it/s][A
 80%|████████  | 8/10 [00:01<00:00,  5.37it/s][A
 90%|█████████ | 9/10 [00:01<00:00,  5.29it/s][A
100%|██████████| 10/10 [00:01<00:00,  5.23it/s][A

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:10<00:41, 10.44s/it][A
 40%|████      | 2/5 [00:20<00:31, 10.44s/it][A
 60%|██████    | 3/5 [00:31<00:20, 10.40s/it][A
 80%|████████  | 4/5 [00:41<00:10, 10.37s/it][A

KeyboardInterrupt: 

## Gazeta + RuArxiv + RuSciText

In [None]:
train_set_ = read_dataset(tokenizer, n=100, title=True, dataset_path = dataset_path_1)
val_set_ = train_set_ [90:]
train_set_ = train_set_ [:90]

In [None]:
# Append the RuSciText splits (train_set_/val_set_) to the current train and
# validation sets, then shuffle each list in place.
# NOTE(review): not idempotent, and it builds on whatever train_set/val_set
# currently hold. If the identical merge cell in the "RuSciText + RuArxiv"
# section already ran in this kernel, the RuSciText examples get added twice.
train_set = train_set + train_set_
val_set = val_set + val_set_
random.shuffle(train_set)
random.shuffle(val_set)

In [None]:
print(len(train_set))
print(len(val_set))

In [None]:
pretrained_model = saved_model_path + '/model_t5_small_9_1.pth'

In [None]:
# Start from the base model and a new AdamW optimizer (lr=1e-4), then overwrite
# both with the states stored in the checkpoint file at `pretrained_model`.
# The checkpoint is loaded on the CPU; the model is moved to `device` before the
# optimizer state is restored.
model_t5 = T5ForConditionalGeneration.from_pretrained(raw_model_name)
optimizer = torch.optim.AdamW(model_t5.parameters(),lr=1e-4)
checkpoint = torch.load(pretrained_model, map_location='cpu')
model_t5.load_state_dict(checkpoint['model_state_dict'])
model_t5.to(device)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
num_steps = 9
model_num = 9

In [None]:
train_model(model_t5, tokenizer, optimizer,
            train_set, val_set,
            num_steps=num_steps, model_num = model_num, step_ = 1)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:04<00:00, 27.25s/it]


{'train_runtime': 544.9545, 'train_samples_per_second': 0.607, 'train_steps_per_second': 0.037, 'train_loss': 0.8365849494934082, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:06<00:00, 27.32s/it]


{'train_runtime': 546.4798, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 0.8028703689575195, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:06<00:00, 27.32s/it]


{'train_runtime': 546.378, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 0.7759933948516846, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:06<00:00, 27.31s/it]


{'train_runtime': 546.1563, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 0.7520785331726074, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:05<00:00, 27.30s/it]


{'train_runtime': 545.9824, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 0.7300347805023193, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 20/20 [09:06<00:00, 27.32s/it]


{'train_runtime': 546.4317, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.037, 'train_loss': 0.7088441848754883, 'epoch': 0.97}


100%|██████████| 37/37 [00:13<00:00,  2.77it/s]

Saving model





Saving loss


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 15%|█▌        | 3/20 [01:16<07:00, 24.76s/it]

KeyboardInterrupt: 

# Запускать при ошибке

Если возникла ошибка при запуске обучения, запустите ячейку и перезапустите ноутбук:

In [None]:

#if "Using the `Trainer` with `PyTorch` requires `accelerate`: Run `pip install --upgrade accelerate`" occurred:
%pip uninstall -y transformers accelerate
%pip install transformers accelerate
clear_output()
# then reload notebook


# Тестирование

In [None]:
try:
    import datasets
except ModuleNotFoundError:
    %pip install datasets
    clear_output()
from datasets import load_dataset

try:
    import rouge
except ModuleNotFoundError:
    %pip install rouge
    clear_output()
from rouge import Rouge

try:
    import nltk
except ModuleNotFoundError:
    %pip install nltk
    clear_output()
from nltk.translate.bleu_score import corpus_bleu

try:
    import evaluate
except ModuleNotFoundError:
    %pip install evaluate
    %pip install bert_score
    clear_output()
from evaluate import load

from torch.utils.data import DataLoader

In [None]:
# Metric objects used by testing(): ROUGE, METEOR and BERTScore.
# NOTE(review): this rebinds `rouge`, which the imports cell bound to the rouge
# module, to a Rouge() instance; the module is no longer reachable by that name.
rouge = Rouge()
meteor = load('meteor')
bertscore = load("bertscore")

Downloading builder script: 100%|██████████| 6.93k/6.93k [00:00<00:00, 4.63MB/s]
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jupyter/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Downloading builder script: 100%|██████████| 7.95k/7.95k [00:00<00:00, 4.39MB/s]


In [None]:
def tests_res(data, model, tokenizer, max_length_text=2890, max_length_ref=500):
    '''
    Generate summaries for every item of `data` to test the model.

    Progress is checkpointed to `logs_path`: `step.txt` holds the current item
    index and `results.txt` the summaries produced so far. Both files are
    rewritten every 150 items and once more after the last item.

    Args:
        data: iterable of mappings with 'text' and 'summary' entries.
        model: model passed to summarize().
        tokenizer: tokenizer passed to summarize().
        max_length_text: input length limit forwarded to summarize().
        max_length_ref: generation length limit forwarded to summarize().

    Returns:
        (res, ref): the generated summaries (list) and the reference summaries (list).
    '''
    import os

    res, ref = [], []

    def _write_progress(step):
        # Overwrite the progress marker and the summaries generated so far.
        with open(logs_path + f'/step.txt', 'w') as f:
            f.write(f'step: {step}')
        # Summaries are natural-language text: fix the encoding explicitly.
        with open(logs_path + f'/results.txt', 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(res))

    # Make sure the log directory exists before the first progress write.
    os.makedirs(logs_path, exist_ok=True)

    i = 0  # keeps the final progress write valid when `data` is empty
    for i, inst in enumerate(data):
        if i%150 == 0:
            _write_progress(i)
        # Forward both limits; max_length_text was previously accepted but unused.
        res.append(summarize(inst['text'], model, tokenizer,
                             max_length_text=max_length_text,
                             max_length_ref=max_length_ref))
        ref.append(inst['summary'])
    _write_progress(i)
    return res, ref

def testing(num_steps, model_num, max_length_ref=500):
    '''
    Evaluate a saved checkpoint on the test set and log the metrics.

    Loads `saved_model_path/model_t5_small_{model_num}_{num_steps}.pth` into a
    fresh base model, generates summaries for the notebook-level `dataset_test`
    (it must be loaded before this is called), prints ROUGE, BLEU, METEOR and
    BERTScore, and appends them to `logs_path/metrics.txt`.

    Args:
        num_steps: step number of the checkpoint to evaluate.
        model_num: model identifier of the checkpoint to evaluate.
        max_length_ref: generation length limit forwarded to tests_res().
    '''
    pretrained_model = saved_model_path + f'/model_t5_small_{model_num}_{num_steps}.pth'
    model_t5 = T5ForConditionalGeneration.from_pretrained(raw_model_name)
    # Only the weights are needed for inference, so the optimizer state stored
    # in the checkpoint is deliberately not restored.
    checkpoint = torch.load(pretrained_model, map_location='cpu')
    model_t5.load_state_dict(checkpoint['model_state_dict'])
    model_t5.to(device)

    model_t5.eval()
    print('Testing')
    model_results, refs = tests_res(dataset_test, model_t5, tokenizer, max_length_ref=max_length_ref)  # model outputs

    # ROUGE
    print('Done! Counting ROUGE')
    scores = rouge.get_scores(model_results, refs, avg=True)
    print(scores)

    # BLEU (whitespace-tokenized, one reference per hypothesis)
    print('Done! Counting BLEU')
    bleu = corpus_bleu([[r.split(" ")] for r in refs], [hyp.split(" ") for hyp in model_results])
    print(bleu)

    # METEOR
    print('Done! Counting METEOR')
    results_m = meteor.compute(predictions=model_results, references=refs)
    print(results_m)

    # BERTScore: average the per-example scores. The non-numeric "hashcode"
    # entry is dropped by name rather than by its position in the dict.
    print('Done! Counting BertScore')
    results_b = bertscore.compute(predictions=model_results, references=refs, lang="ru")
    results_b = {k: np.mean(v) for k, v in results_b.items() if k != "hashcode"}
    print(results_b)

    print('Saving scores')
    with open(logs_path + f'/metrics.txt', 'a', encoding='utf-8') as f:
        f.write(f'MODEL: {model_num}_{num_steps}\n')
        f.write(f'ROUGE: {scores}\n')
        f.write(f'BLEU: {bleu}\n')
        f.write(f'METEOR: {results_m}\n')
        f.write(f'BertScore: {results_b}\n\n')

In [None]:
# Подгрузка датасета
dataset_test = load_dataset('IlyaGusev/gazeta', split='test[4822:]', revision="v2.0")
clear_output()

In [None]:
num_steps = 12
model_num = 2
testing(num_steps, model_num)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


Testing


In [None]:
num_steps = 14
model_num = 7
testing(num_steps, model_num, max_length_ref=100)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


Testing


In [None]:
num_steps = 8
model_num = 8
testing(num_steps, model_num)

In [None]:
num_steps = 4
model_num = 9
testing(num_steps, model_num)