**ТИНЬКОФФ МЛ СИРИУС ЛАГЕРЬ**

In [1]:
!pip install -U transformers datasets accelerate spacy bitsandbytes evaluate sentencepiece tokenizers torchinfo sacrebleu rouge_score peft jiwer wandb

Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting tokenizers
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Prepa

In [2]:
import torch
from torchinfo import summary

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, \
 T5ForConditionalGeneration, DataCollatorForSeq2Seq, \
  Seq2SeqTrainingArguments, Seq2SeqTrainer, default_data_collator
from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets, DatasetDict
import evaluate

import pandas as pd
import numpy as np

import json
import os
import re
np.random.seed(42)

In [3]:
import wandb

wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="best_model"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="false"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

In [5]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [6]:
MAX_INSTRUCTION_LENGTH = 600
MAX_TARGET_LENGTH = 256

In [7]:
def task_choice():
  return np.random.choice(['aaqg', 'qg', 'qa'],
                          p=[0.4, 0.3, 0.3])

## Datasets Preparation

In [18]:
QA_PROMPT = "Сгенерируй ответ на вопрос по тексту. Текст: '{context}'. Вопрос: '{question}'."
AAQG_PROMPT = """AAQG | Текст: '{context}'. Ответ: '{answer}'"""
QG_PROMPT = """QG | Текст: '{context}'"""
QA_PROMPT = """QA | Текст: '{context}'. Вопрос: '{question}'"""
AAQG_BINARY_PROMPT = "Сгенерируй вопрос по тексту, используя известный бинарный ответ на него.\nТекст: {context}. Ответ: {answer}"

### Sberquad

In [9]:
from datasets import load_dataset

dataset = load_dataset("kuznetsoffandrey/sberquad")

train_dataset = load_dataset("kuznetsoffandrey/sberquad", split="train")
valid_dataset = load_dataset("kuznetsoffandrey/sberquad", split="validation")
test_dataset  = load_dataset("kuznetsoffandrey/sberquad", split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/3.43M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/4.93M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45328 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5036 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/23936 [00:00<?, ? examples/s]

In [10]:
def _generate_squad_examples(filepath):
  """This function returns the examples in the raw (text) form."""
  with open(filepath, encoding="utf-8") as f:
      squad = json.load(f)
      for article in squad["data"]:
          title = article.get("title", "")
          for paragraph in article["paragraphs"]:
              context = paragraph["context"]
              for qa in paragraph["qas"]:
                  answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                  answers = [answer["text"] for answer in qa["answers"]]
                  yield {
                      "title": title,
                      "context": context,
                      "question": qa["question"],
                      "id": qa["id"],
                      "answers": {
                          "answer_start": answer_starts,
                          "text": answers,
                          },
                      }

In [11]:
sbersquad = DatasetDict({'train': train_dataset, 'validation': valid_dataset})
sbersquad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
})

In [12]:
sbersquad['train'][0]

{'id': 62310,
 'title': 'SberChallenge',
 'context': 'В протерозойских отложениях органические остатки встречаются намного чаще, чем в архейских. Они представлены известковыми выделениями сине-зелёных водорослей, ходами червей, остатками кишечнополостных. Кроме известковых водорослей, к числу древнейших растительных остатков относятся скопления графито-углистого вещества, образовавшегося в результате разложения Corycium enigmaticum. В кремнистых сланцах железорудной формации Канады найдены нитевидные водоросли, грибные нити и формы, близкие современным кокколитофоридам. В железистых кварцитах Северной Америки и Сибири обнаружены железистые продукты жизнедеятельности бактерий.',
 'question': 'чем представлены органические остатки?',
 'answers': {'text': ['известковыми выделениями сине-зелёных водорослей'],
  'answer_start': [109]}}

In [13]:
def sberquad_process(row):
  task_type = task_choice()
  if task_type == 'aaqg':
    # prompt = np.random.choice(AAQG_PROMPTS)
    prompt = AAQG_PROMPT
    return {
        'task_type': task_type,
        'instruction': prompt.format(context=row['context'], answer=row['answers']['text'][0]),
        'target': row['question'],
        'source': 'sbersquad'
    }
  elif task_type == 'qg':
    # prompt = np.random.choice(QG_PROMPTS)
    prompt = QG_PROMPT
    return {
        'task_type': task_type,
        'instruction': prompt.format(context=row['context']),
        'target': row['question'],
        'source': 'sbersquad'
    }
  else:
    # prompt = np.random.choice(QA_PROMPTS)
    prompt = QA_PROMPT
    return {
        'task_type': task_type,
        'instruction': QA_PROMPT.format(context=row['context'], question=row['question']),
        'target': row['answers']['text'][0],
        'source': 'sbersquad'
    }

In [14]:

sbersquad = sbersquad.map(sberquad_process)
sbersquad

Map:   0%|          | 0/45328 [00:00<?, ? examples/s]

Map:   0%|          | 0/5036 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'task_type', 'instruction', 'target', 'source'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'task_type', 'instruction', 'target', 'source'],
        num_rows: 5036
    })
})

In [15]:
sbersquad = sbersquad.remove_columns(['title', 'context', 'question', 'id', 'answers'])
sbersquad

DatasetDict({
    train: Dataset({
        features: ['task_type', 'instruction', 'target', 'source'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['task_type', 'instruction', 'target', 'source'],
        num_rows: 5036
    })
})

## Model

In [27]:
#model_name = 'ai-forever/FRED-T5-large'
# model_name = 'IlyaGusev/saiga_mistral_7b_lora'
# model_name = 'ai-forever/FRED-T5-1.7B'
model_name = 'ai-forever/ruT5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [17]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [18]:
print_trainable_parameters(model)

trainable params: 222903552 || all params: 222903552 || trainable%: 100.0


### Tokenization

In [19]:
def preprocess_function(batch):
  inputs = batch['instruction']
  targets = [text + tokenizer.eos_token for text in batch['target']]

  model_inputs = tokenizer(inputs, max_length=MAX_INSTRUCTION_LENGTH, truncation=True)

  labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [21]:
sbersquad = sbersquad.map(
    preprocess_function,
    batched=True,
    num_proc=6,
    remove_columns=sbersquad['train'].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
sbersquad

Running tokenizer on dataset (num_proc=6):   0%|          | 0/45328 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=6):   0%|          | 0/5036 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5036
    })
})

In [22]:
np.quantile([len(x) for x in sbersquad['train']['input_ids']], q=0.97)

334.0

In [23]:
np.quantile([len(x) for x in sbersquad['train']['labels']], q=0.97)

30.0

### Training

In [24]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100
)
# data_collator = default_data_collator

In [25]:
blue_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")
chrf_metric = evaluate.load("chrf")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [26]:
chrf_metric.compute(predictions=["Some", "v", "of me"], references=["sme", "kBVwkv", "of me"], lowercase=True)

{'score': 43.01312680900858, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [28]:
def postprocess_text(preds, labels):
  preds = [pred.strip() for pred in preds]
  labels = [label.strip() for label in labels]
  return preds, labels


def preprocess_logits_fix(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels


def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
      preds = preds[0]

  # preds, labels = preds.detach().cpu().numpy(), labels.detach().cpu().numpy()

  # Replace -100s used for padding as we can't decode them
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post-processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  blue_result = blue_metric.compute(predictions=decoded_preds, references=decoded_labels)
  rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels,
                                      rouge_types=['rouge1', 'rouge2', 'rougeL'])
  chrf_result = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels,
                                    lowercase=True)

  result = {"sbleu": blue_result["score"], "chr_f": chrf_result['score']}
  result = dict(list(result.items()) + list(rouge_result.items()))

  # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  # result["gen_len"] = np.mean(prediction_lens)
  result = {k: round(v, 4) for k, v in result.items()}

  del eval_preds, preds, labels, decoded_labels, decoded_preds

  return result

In [42]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./models",
    optim="adafactor",
    num_train_epochs=1, #в идеале 2 эпохи, но да ладно
    do_train=True,
    gradient_checkpointing=True,
    bf16=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=4,
    logging_dir="./logs",
    report_to="wandb",
    logging_steps=10,
    save_strategy="steps",
    save_steps=5000,
    evaluation_strategy="steps",
    eval_steps=300,
    learning_rate=3e-5,
    predict_with_generate=False,
    generation_max_length=64
)



In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, \
 T5ForConditionalGeneration, DataCollatorForSeq2Seq, \
  Seq2SeqTrainingArguments, Seq2SeqTrainer, default_data_collator

In [43]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=sbersquad['train'],
    eval_dataset=sbersquad['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_fix
)

In [44]:
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,Sbleu,Chr F,Rouge1,Rouge2,Rougel
300,1.072,1.025008,18.2064,62.3163,0.1104,0.0352,0.1098
600,1.0055,1.00753,18.5231,62.5647,0.1133,0.0365,0.1128
900,1.251,0.959073,18.869,63.0017,0.1151,0.0356,0.1146
1200,1.2478,0.944776,18.6563,62.8198,0.1154,0.0358,0.115


TrainOutput(global_step=1416, training_loss=1.1038680046291676, metrics={'train_runtime': 6877.7398, 'train_samples_per_second': 6.591, 'train_steps_per_second': 0.206, 'total_flos': 1.594497243635712e+16, 'train_loss': 1.1038680046291676, 'epoch': 0.9996470172961525})

In [45]:
trainer.save_model(f'/wandb/best_model{MAX_INSTRUCTION_LENGTH}_t{MAX_TARGET_LENGTH}')

In [57]:
import shutil
archive_path = f'/wandb/best_model{MAX_INSTRUCTION_LENGTH}_t{MAX_TARGET_LENGTH}'
shutil.make_archive(archive_path.replace('.zip', ''), 'zip', archive_path)
print(f"Архив сохранен по адресу: {archive_path}")

Архив сохранен по адресу: /wandb/best_model600_t256


In [None]:
wandb.finish()

## Tests

In [11]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [14]:
import torch
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [15]:
saved_checkpoint = f'/content/drive/MyDrive/best_model600_t256'
tokenizer = AutoTokenizer.from_pretrained(saved_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(saved_checkpoint).to(device)

In [16]:
from functools import partial

def generate_text(prompt, tokenizer, model, n=1, temperature=0.8, num_beams=3):
  encoded_input = tokenizer.encode_plus(prompt, return_tensors='pt')
  encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}

  resulted_tokens = model.generate(**encoded_input,
                                   eos_token_id=2,
                                   max_new_tokens=64,
                                   do_sample=True,
                                   num_beams=num_beams,
                                   num_return_sequences=n,
                                   temperature=temperature,
                                   top_p=0.9,
                                   top_k=50)
  resulted_texts = tokenizer.batch_decode(resulted_tokens, skip_special_tokens=True)

  return resulted_texts

generate_text = partial(generate_text, tokenizer=tokenizer, model=model)

In [24]:
test_context = "Термин «физика» впервые фигурирует в сочинениях одного из величайших мыслителей древности — Аристотеля (IV век до нашей эры). Первоначально термины «физика» и «философия» были синонимами, так как в основе обеих дисциплин лежало стремление объяснить законы функционирования Вселенной. Однако в результате научной революции XVI века физика развилась в самостоятельную научную отрасль."

In [25]:
test_qa_prompt = QA_PROMPT.format(context=test_context,
                                  question='Кто первый придумал слово "физика"?')
test_qa_prompt

'QA | Текст: \'Термин «физика» впервые фигурирует в сочинениях одного из величайших мыслителей древности — Аристотеля (IV век до нашей эры). Первоначально термины «физика» и «философия» были синонимами, так как в основе обеих дисциплин лежало стремление объяснить законы функционирования Вселенной. Однако в результате научной революции XVI века физика развилась в самостоятельную научную отрасль.\'. Вопрос: \'Кто первый придумал слово "физика"?\''

In [26]:
test_answers = generate_text(test_qa_prompt, n=1)
test_answers

['Аристотель']