In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.9 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 72.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [2]:
import pandas as pd
DATASET_PATH = '/content/dataset.jsonl'

with open(DATASET_PATH) as f: 
    df = pd.read_json(DATASET_PATH, lines=True).set_index('id')

In [3]:
df.head()

Unnamed: 0_level_0,date,rating,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2004-08-30 11:24:00+00:00,22010.0,"<Ares> ppdv, все юниксы очень дружелюбны.. они..."
2,2004-08-30 11:25:00+00:00,25105.0,<томатик_рад> а ты не чувствуешь красоту мира?...
3,2004-08-30 11:27:00+00:00,7192.0,"<Дор> ""мышка, почему у тебя такие большие глаз..."
4,2004-08-30 11:28:00+00:00,29169.0,"<PPDV[os2]> ""Мальчики, вы что больные, бегать ..."
5,2004-08-30 11:26:00+00:00,7140.0,<Ohtori_Akio> мы - как разработчики - живём с ...


In [4]:
data = df.loc[:5000, 'text']

In [5]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.15)

In [6]:
import re

def build_text_files(data_json, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for texts in data_json:
        summary = str(texts).strip()
        summary = re.sub(r"\[\w+\]", "", summary)
        summary = re.sub(r"<[\w+,\!, -]>", "", summary)
        summary = re.sub(r"<\w+>", "", summary)
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)
  
build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

In [7]:
with open('train_dataset.txt') as f: 
    train_dataset = f.read()

In [8]:
print("Train dataset length: "+ str(len(train)))
print("Test dataset length: "+ str(len(test)))

Train dataset length: 1197
Test dataset length: 212


In [9]:
from transformers import AutoTokenizer
model_name = "sberbank-ai/rugpt3small_based_on_gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

In [11]:
from transformers import TextDataset, DataCollatorForLanguageModeling

#создает тренировочный и тестовый датасет
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



In [12]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)

Downloading:   0%|          | 0.00/551M [00:00<?, ?B/s]

In [13]:
training_args = TrainingArguments(
    output_dir="gdrive/MyDrive/GPT/gpt2-chief", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [15]:
trainer.train()

***** Running training *****
  Num examples = 469
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 354


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=354, training_loss=4.386708060226871, metrics={'train_runtime': 68.1747, 'train_samples_per_second': 20.638, 'train_steps_per_second': 5.193, 'total_flos': 91909472256000.0, 'train_loss': 4.386708060226871, 'epoch': 3.0})

In [16]:
trainer.save_model()

Saving model checkpoint to gdrive/MyDrive/GPT/gpt2-chief
Configuration saved in gdrive/MyDrive/GPT/gpt2-chief/config.json
Model weights saved in gdrive/MyDrive/GPT/gpt2-chief/pytorch_model.bin


In [17]:
tokenizer.save_pretrained('/content/gpt2-chief')

model.save_pretrained('/content/model_gpt_chf')

tokenizer config file saved in /content/gpt2-chief/tokenizer_config.json
Special tokens file saved in /content/gpt2-chief/special_tokens_map.json
Configuration saved in /content/model_gpt_chf/config.json
Model weights saved in /content/model_gpt_chf/pytorch_model.bin


In [18]:
tokenizer = AutoTokenizer.from_pretrained("/content/gpt2-chief")

model1 = AutoModelForCausalLM.from_pretrained("/content/model_gpt_chf")

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file /content/model_gpt_chf/config.json
Model config GPT2Config {
  "_name_or_path": "/content/model_gpt_chf",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 2048,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 2048,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": tr

In [19]:
!pip install corus
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting corus
  Downloading corus-0.9.0-py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 1.8 MB/s 
[?25hInstalling collected packages: corus
Successfully installed corus-0.9.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.2-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 5.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 68.1 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 70.8 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  D

In [20]:
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2

--2022-10-06 15:49:28--  https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/87156914/619f9f00-1e96-11ea-946e-dac89df8aced?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221006%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221006T154929Z&X-Amz-Expires=300&X-Amz-Signature=90b48e88987d6b4d7b5087c5ac1190dc770ec7c6cf382039fba6c9bfd23aa474&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=87156914&response-content-disposition=attachment%3B%20filename%3Dlenta-ru-news.csv.bz2&response-content-type=application%2Foctet-stream [following]
--2022-10-06 15:49:29--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/87156914/619f9f00-1e96-11ea-946e-da

In [21]:
from corus import load_lenta2

path = 'lenta-ru-news.csv.bz2'
records = load_lenta2(path)
next(records)

LentaRecord(
    url='https://lenta.ru/news/1914/09/16/hungarnn/',
    title='1914. Русские войска вступили в\xa0пределы Венгрии  ',
    text='Бои у Сопоцкина и Друскеник закончились отступлением германцев. Неприятель, приблизившись с севера к Осовцу начал артиллерийскую борьбу с крепостью. В артиллерийском бою принимают участие тяжелые калибры. С раннего утра 14 сентября огонь достиг значительного напряжения. Попытка германской пехоты пробиться ближе к крепости отражена. В Галиции мы заняли Дембицу. Большая колонна, отступавшая по шоссе от Перемышля к Саноку, обстреливалась с высот нашей батареей и бежала, бросив парки, обоз и автомобили. Вылазки гарнизона Перемышля остаются безуспешными. При продолжающемся отступлении австрийцев обнаруживается полное перемешивание их частей, захватываются новые партии пленных, орудия и прочая материальная часть. На перевале Ужок мы разбили неприятельский отряд, взяли его артиллерию и много пленных и, продолжая преследовать, вступили в пределы Венгрии

In [22]:
data = [(record.title, record.text) for record in records]

In [23]:
import pandas as pd
df_news = pd.DataFrame({'title': [record[0] for record in data], 'text': [record[1] for record in data]})
df_news.head()

Unnamed: 0,title,text
0,1914. Празднование столетия М.Ю. Лермонтова от...,"Министерство народного просвещения, в виду про..."
1,1914. Das ist Nesteroff!,"Штабс-капитан П. Н. Нестеров на днях, увидев в..."
2,1914. Бульдог-гонец под Льежем,Фотограф-корреспондент Daily Mirror рассказыва...
3,1914. Под Люблином пойман швабский зверь,"Лица, приехавшие в Варшаву из Люблина, передаю..."
4,"Космонавты сомневаются в надежности ""Мира""",Как стало известно агентству Ассошиэйтед Пресс...


In [24]:
max(len(i) for i in df_news['text'].values), max(len(i) for i in df_news['title'].values)

(55386, 132)

In [25]:
data_news = df_news[:500]

In [26]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(data_news, test_size=0.15)

In [27]:
from datasets import Dataset
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)
dataset_train, dataset_test

(Dataset({
     features: ['title', 'text'],
     num_rows: 425
 }), Dataset({
     features: ['title', 'text'],
     num_rows: 75
 }))

In [28]:
from transformers import AutoTokenizer

model_name = "IlyaGusev/rut5_base_sum_gazeta"
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_len_txt = 400
max_len_tlt = 50

def tokenize(batch):
    tokenized_input = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=max_len_txt)
    tokenized_label = tokenizer(batch['title'], padding='max_length', truncation=True, max_length=max_len_tlt)
    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

dataset_train = dataset_train.map(tokenize, batched=True, batch_size=8)
dataset_test = dataset_test.map(tokenize, batched=True, batch_size=8)

dataset_train.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
dataset_test.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

Downloading:   0%|          | 0.00/279 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/828k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--IlyaGusev--rut5_base_sum_gazeta/snapshots/f09a08cae5d74c70e55da1a6ebb49f88c26f433b/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--IlyaGusev--rut5_base_sum_gazeta/snapshots/f09a08cae5d74c70e55da1a6ebb49f88c26f433b/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--IlyaGusev--rut5_base_sum_gazeta/snapshots/f09a08cae5d74c70e55da1a6ebb49f88c26f433b/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--IlyaGusev--rut5_base_sum_gazeta/snapshots/f09a08cae5d74c70e55da1a6ebb49f88c26f433b/tokenizer_config.json


  0%|          | 0/54 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [29]:
dataset_train, dataset_test

(Dataset({
     features: ['title', 'text', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 425
 }), Dataset({
     features: ['title', 'text', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 75
 }))

In [30]:
dataset_train.save_to_disk('gazeta/train')
dataset_test.save_to_disk('gazeta/test')

In [31]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
model_name = "IlyaGusev/rut5_base_sum_gazeta"
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading:   0%|          | 0.00/766 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--IlyaGusev--rut5_base_sum_gazeta/snapshots/f09a08cae5d74c70e55da1a6ebb49f88c26f433b/config.json
Model config T5Config {
  "_name_or_path": "cointegrated/rut5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 2,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 2,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "max_length": 200,
  "model_type": "t5",
  "num_beams": 5,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transform

Downloading:   0%|          | 0.00/977M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--IlyaGusev--rut5_base_sum_gazeta/snapshots/f09a08cae5d74c70e55da1a6ebb49f88c26f433b/pytorch_model.bin
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at IlyaGusev/rut5_base_sum_gazeta.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


In [32]:
output_dir = 'gazeta/output'
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=1000, 
    remove_unused_columns=True, 
    eval_steps=500
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
from datasets import load_from_disk
dataset_train = load_from_disk("gazeta/train")
dataset_test = load_from_disk("gazeta/test")
dataset_train, dataset_test

(Dataset({
     features: ['title', 'text', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 425
 }), Dataset({
     features: ['title', 'text', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 75
 }))

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test
)

trainer.train()

***** Running training *****
  Num examples = 425
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 639
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title. If text, title are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.


Step,Training Loss
500,4.1775




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=639, training_loss=3.380409216843487, metrics={'train_runtime': 211.7534, 'train_samples_per_second': 6.021, 'train_steps_per_second': 3.018, 'total_flos': 677081548800000.0, 'train_loss': 3.380409216843487, 'epoch': 3.0})

In [35]:
trainer.save_model(output_dir + '/model')

Saving model checkpoint to gazeta/output/model
Configuration saved in gazeta/output/model/config.json
Model weights saved in gazeta/output/model/pytorch_model.bin


In [36]:
import torch
#тестовые данные
INX = 10
input_text = dataset_test['text'][INX]
input_title = dataset_test['title'][INX]

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    source_ids = tokenized_text['input_ids'].to(dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(dtype = torch.long)

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask, 
        max_length=512,
        num_beams=7,
        temperature = 1.3,
        repetition_penalty=1, 
        length_penalty=1, 
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("Text:\n" + input_text)
print("Real title: " + input_title)
print("Pred title: " + pred)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: ignored