In [None]:
заметки:
- baseline нужен только как стартовая точка
- full обычно дает максимум качества, но дольше
- linear probing быстрый, но часто чуть хуже, посмотрим
- LoRA — компромисс по времени/качеству, тоже сравним метрики

Соревнование: https://huggingface.co/datasets/AlexSham/Toxic_Russian_Comments/

In [None]:
Обоснования:
- `ruBert-base` — адекватная база под RU, но тяжелая, возьмем tiny модельку, быстрее и с сохранением приемлемого качества
- метрики: accuracy + f1/precision/recall для дисбаланса
- для full и LoRA — возьмём небольшой LR, чтобы обучение шло аккуратнее; для линейной головы learning rate можно взять побольше

In [14]:
pip install --upgrade transformers accelerate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.12.0-py3-none-any.whl (380 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.1.1
    Uninstalling accelerate-1.1.1:
      Successfully uninstalled accelerate-1.1.1
Successfully installed accelerate-1.12.0
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autoawq 0.2.6 requires torch==2.3.1, but you have torch 2.7.1+cu118 which is incompatible.


In [1]:
# сначала подтянем либы и зафиксируем сиды, чтобы все воспроизводилось
# база
import os
import time
import random
import numpy as np
import pandas as pd
import torch
from datasets import ClassLabel

# датасеты + трансформеры (токенизация и тренер)
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
    TrainingArguments # непонятная ошибка в окружении, должно помочь
)

# метрики для классификации
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# LoRA
from peft import LoraConfig, TaskType, get_peft_model

# Reproducibility: seed the HF helper and every RNG used in the notebook.
SEED = 2003
set_seed(SEED)
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Run on the GPU when one is visible; mixed precision is only flagged on then.
_cuda_available = torch.cuda.is_available()
DEVICE = "cuda" if _cuda_available else "cpu"
USE_FP16 = _cuda_available
print(f"device: {DEVICE}")

# Russian base checkpoint. The tiny variant was chosen because the regular
# RuBERT did not fit the author's GPU and the Google Colab quota was used up.
MODEL_NAME = "cointegrated/rubert-tiny2"


device: cuda


In [3]:
# The train/test JSONL files are expected in the current working directory.
data_dir = os.path.abspath(".")
train_path, test_path = (
    os.path.join(data_dir, file_name) for file_name in ("train.jsonl", "test.jsonl")
)
raw = load_dataset("json", data_files=dict(train=train_path, test=test_path))

# Unique label values of the training data, sorted for a stable class order.
labels = sorted(raw["train"].unique("label"))

# Re-type the "label" column as ClassLabel before the stratified split below.
class_label = ClassLabel(names=labels)
raw["train"] = raw["train"].cast_column("label", class_label)

# Hold out 10% of the training data for validation, stratified by label.
split = raw["train"].train_test_split(
    stratify_by_column="label",
    test_size=0.1,
    seed=SEED,
)
train_ds, val_ds, test_ds = split["train"], split["test"], raw["test"]

# Class balance check: share of each label in the training split.
print(pd.Series(train_ds["label"]).value_counts(normalize=True))


0    0.820351
1    0.179649
Name: proportion, dtype: float64


In [4]:
# Tokenize the texts once up front; everything downstream works on tensors.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
max_length = 160  # longer inputs are truncated to this many tokens


def tokenize_fn(batch):
    """Tokenize the "text" field of a batch, truncating to `max_length`."""
    return tokenizer(batch["text"], truncation=True, max_length=max_length)


# Tokenize every split in batches and drop the raw "text" column afterwards.
train_tok, val_tok, test_tok = (
    split_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    for split_ds in (train_ds, val_ds, test_ds)
)

# Padding is applied per batch by the collator rather than at tokenization time.
collator = DataCollatorWithPadding(tokenizer)

# Have the datasets return torch tensors.
for tokenized_ds in (train_tok, val_tok, test_tok):
    tokenized_ds.set_format("torch")

# Sanity check: look at one tokenized training example.
print(train_tok[0])

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/201114 [00:00<?, ? examples/s]

Map:   0%|          | 0/22347 [00:00<?, ? examples/s]

Map:   0%|          | 0/24829 [00:00<?, ? examples/s]

{'label': tensor(0), 'input_ids': tensor([    2,   312, 26629,  1619, 11352, 10164,     3]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1])}


In [5]:
# здесь метрики и фабрика тренера, чтобы не копипастить
# набор метрик для eval
def compute_metrics(eval_pred):
    """Compute classification metrics from a ``(logits, labels)`` pair.

    Predicted classes are the arg-max over the last axis of the logits.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall", each
        produced by the matching scikit-learn scorer with default arguments.
    """
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=-1)
    scorers = (
        ("accuracy", accuracy_score),
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers}


# общий конструктор тренера под разные режимы
def build_trainer(model, lr, output_dir, num_epochs=1):
    """Build a Trainer with the hyper-parameters shared by all experiments.

    Only the model, learning rate, output directory and epoch count vary
    between runs; everything else is fixed here so the runs stay comparable.
    Training uses the notebook-level ``train_tok`` / ``val_tok`` datasets,
    ``tokenizer``, ``collator`` and ``compute_metrics``.

    Args:
        model: model to train.
        lr: learning rate.
        output_dir: directory for checkpoints of this run.
        num_epochs: number of training epochs. Previously this argument was
            ignored and one epoch was always used; it is now honoured, and the
            default is 1 so existing calls keep training for a single epoch.

    Returns:
        A configured ``Trainer`` (not yet trained).
    """
    args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=lr,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=num_epochs,
        # Evaluate and checkpoint once per epoch, then reload the checkpoint
        # with the best validation F1.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        weight_decay=0.01,
        logging_steps=500,  # log less often to keep training fast
        report_to="none",
        # Follow the notebook-level flag instead of forcing fp16 on, so the
        # trainer can also be built when no CUDA device is available.
        fp16=USE_FP16,
        seed=SEED,
        dataloader_num_workers=4,
    )

    return Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        # `tokenizer=` is deprecated on Trainer; `processing_class` replaces it.
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )


# отдельный eval без обучения — для baseline
def eval_only(model, eval_dataset, tokenizer, collator, compute_metrics):
    """Evaluate a model without training it (used for the baseline).

    Args:
        model: model to evaluate as-is.
        eval_dataset: tokenized dataset to evaluate on.
        tokenizer: tokenizer that belongs to the model.
        collator: data collator used to build evaluation batches.
        compute_metrics: callable turning predictions into a metrics dict.

    Returns:
        The metrics dict returned by ``Trainer.evaluate()``.
    """
    args = TrainingArguments(
        output_dir="tmp_eval",
        per_device_eval_batch_size=16,
        report_to="none",
        do_train=False,
        do_eval=True,
    )
    trainer = Trainer(
        model=model,
        args=args,
        eval_dataset=eval_dataset,
        # `tokenizer=` is deprecated on Trainer; `processing_class` replaces it.
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )
    return trainer.evaluate()

In [7]:
# Baseline without any training: gives the starting point for the comparison.
# Same checkpoint as every other run (MODEL_NAME), with a fresh 2-class head.
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

# Reuse the tokenizer created in the tokenization cell: the collator was built
# from it, so loading a second copy here would only rebind the global name.
baseline_metrics = eval_only(
    baseline_model,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    collator=collator,
    compute_metrics=compute_metrics,
)
print(baseline_metrics)

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'eval_loss': 0.6966676115989685, 'eval_model_preparation_time': 0.002, 'eval_accuracy': 0.46444712936859534, 'eval_f1': 0.25498007968127495, 'eval_precision': 0.16997261183500706, 'eval_recall': 0.5100871731008717, 'eval_runtime': 26.5633, 'eval_samples_per_second': 841.272, 'eval_steps_per_second': 52.591}


In [9]:
# Full fine-tuning: all model weights stay trainable.
full_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
full_trainer = build_trainer(full_model, lr=2e-5, output_dir="runs/full")

# Wall-clock duration of the training call only.
start = time.time()
full_trainer.train()
full_time = time.time() - start

# Validation metrics after training, extended with the measured training time.
full_metrics = {**full_trainer.evaluate(), "train_time_sec": full_time}
full_metrics

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1071,0.116595,0.971495,0.92082,0.919107,0.92254


{'eval_loss': 0.116595059633255,
 'eval_accuracy': 0.9714950552646888,
 'eval_f1': 0.9208203853325047,
 'eval_precision': 0.919106699751861,
 'eval_recall': 0.9225404732254048,
 'eval_runtime': 129.2843,
 'eval_samples_per_second': 172.852,
 'eval_steps_per_second': 21.611,
 'epoch': 1.0,
 'train_time_sec': 1634.379587650299}

In [11]:
# Linear probing: the base model is frozen, only the head on top is trained.
lp_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Turn off gradients for every parameter of the base model.
lp_model.base_model.requires_grad_(False)

# Higher learning rate than for full fine-tuning since only the head learns.
lp_trainer = build_trainer(lp_model, lr=5e-4, output_dir="runs/linear_probing")

# Wall-clock duration of the training call only.
start = time.time()
lp_trainer.train()
lp_time = time.time() - start

# Validation metrics after training, extended with the measured training time.
lp_metrics = {**lp_trainer.evaluate(), "train_time_sec": lp_time}
lp_metrics

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2657,0.264719,0.888576,0.646307,0.752066,0.566625


{'eval_loss': 0.26471850275993347,
 'eval_accuracy': 0.8885756477379514,
 'eval_f1': 0.6463068181818183,
 'eval_precision': 0.7520661157024794,
 'eval_recall': 0.5666251556662516,
 'eval_runtime': 129.134,
 'eval_samples_per_second': 173.053,
 'eval_steps_per_second': 21.636,
 'epoch': 1.0,
 'train_time_sec': 949.4691407680511}

In [17]:
# LoRA: train small adapter matrices instead of all model weights.
# NOTE(review): the recorded run of this cell failed with
# "ImportError: cannot import name 'shard_checkpoint' from
# 'transformers.modeling_utils'". This is presumably caused by the installed
# autoawq 0.2.6 being incompatible with the current transformers -- TODO
# confirm (e.g. by uninstalling autoawq and restarting the kernel).

# Adapter setup: which modules receive adapters and how strongly they act.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap a freshly loaded 2-class model with the adapters.
lora_model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2),
    lora_config,
)

# Higher LR than the full run (2e-4 vs 2e-5): few parameters are trained here.
lora_trainer = build_trainer(lora_model, lr=2e-4, output_dir="runs/lora")

# Wall-clock duration of the training call only.
start = time.time()
lora_trainer.train()
lora_time = time.time() - start

# Validation metrics after training, extended with the measured training time.
lora_metrics = {**lora_trainer.evaluate(), "train_time_sec": lora_time}
lora_metrics

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: cannot import name 'shard_checkpoint' from 'transformers.modeling_utils' (C:\Users\user\anaconda3\Lib\site-packages\transformers\modeling_utils.py)

In [None]:
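# LoRA запустить не получилось: ImportError (`shard_checkpoint` не импортируется из `transformers.modeling_utils`), то есть несовместимость версий библиотек. Вероятная причина — установленный autoawq 0.2.6, рассчитанный на более старые версии torch/transformers; стоит попробовать удалить его (`pip uninstall autoawq`) и перезапустить ядро.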

In [15]:
import transformers
import peft

# Diagnostic for the LoRA failure: show the installed library versions
# (transformers first, then peft).
for lib in (transformers, peft):
    print(lib.__version__)


4.57.6
0.18.1


In [21]:
# сводим метрики в табличку для честного сравнения
# helper, чтобы не писать одно и то же
def metrics_to_row(name, m):
    """Convert one run's metrics dict into a row for the comparison table.

    Args:
        name: label of the run, stored under "model".
        m: metrics mapping; the "eval_*" keys and "train_time_sec" are read
            with ``.get``, so missing entries become ``None``.

    Returns:
        dict with the keys "model", "accuracy", "f1", "precision", "recall"
        and "train_time_sec".
    """
    # Output column -> key to look up in the metrics mapping.
    source_keys = {
        "accuracy": "eval_accuracy",
        "f1": "eval_f1",
        "precision": "eval_precision",
        "recall": "eval_recall",
        "train_time_sec": "train_time_sec",
    }
    row = {"model": name}
    for column, key in source_keys.items():
        row[column] = m.get(key)
    return row
# The baseline was never trained, so it gets an explicit empty training time
# next to its four evaluation metrics.
baseline_summary = {
    **{
        key: baseline_metrics[key]
        for key in ("eval_accuracy", "eval_f1", "eval_precision", "eval_recall")
    },
    "train_time_sec": None,
}

# One table row per experiment.
rows = [
    metrics_to_row(run_name, run_metrics)
    for run_name, run_metrics in (
        ("baseline", baseline_summary),
        ("full", full_metrics),
        ("linear_probing", lp_metrics),
        # ("lora", lora_metrics),
    )
]

# Best F1 first.
pd.DataFrame(rows).sort_values("f1", ascending=False)

Unnamed: 0,model,accuracy,f1,precision,recall,train_time_sec
1,full,0.971495,0.92082,0.919107,0.92254,1634.379588
2,linear_probing,0.888576,0.646307,0.752066,0.566625,949.469141
0,baseline,0.464447,0.25498,0.169973,0.510087,


full-модель показывает лучший результат; особенно заметно в лучшую сторону отличаются precision и recall.
baseline — результат на уровне случайного угадывания, что естественно: классификационная голова не обучена.
lora — обучить не получилось из-за проблем с совместимостью библиотек (см. ImportError выше).

In [25]:
# для интереса вытаскиваем историю лоссов/оценок
# вытаскиваем логи конкретного тренера
def extract_log_history(trainer, name):
    """Return a trainer's loss/eval log entries as a DataFrame.

    Only entries of ``trainer.state.log_history`` that contain a "loss" or an
    "eval_loss" key are kept; each kept entry gets a "model" column set to
    ``name``.
    """
    tagged_entries = [
        {"model": name, **entry}
        for entry in trainer.state.log_history
        if "loss" in entry or "eval_loss" in entry
    ]
    return pd.DataFrame(tagged_entries)

# Stack the log histories of all finished runs into a single frame.
logs = pd.concat(
    [
        extract_log_history(run_trainer, run_name)
        for run_name, run_trainer in (
            ("full", full_trainer),
            ("linear_probing", lp_trainer),
            # ("lora", lora_trainer),
        )
    ],
    ignore_index=True,
)

# Show the last entries.
logs.tail(10)

Unnamed: 0,model,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second
94,linear_probing,0.2665,0.252617,7.3e-05,0.855211,21500,,,,,,,,
95,linear_probing,0.2712,0.485519,6.3e-05,0.875099,22000,,,,,,,,
96,linear_probing,0.2886,0.259065,5.3e-05,0.894988,22500,,,,,,,,
97,linear_probing,0.2799,0.450012,4.3e-05,0.914877,23000,,,,,,,,
98,linear_probing,0.2872,0.483303,3.3e-05,0.934765,23500,,,,,,,,
99,linear_probing,0.285,0.211944,2.3e-05,0.954654,24000,,,,,,,,
100,linear_probing,0.2776,0.275093,1.3e-05,0.974543,24500,,,,,,,,
101,linear_probing,0.2657,0.371518,3e-06,0.994431,25000,,,,,,,,
102,linear_probing,,,,1.0,25140,0.264719,0.888576,0.646307,0.752066,0.566625,138.7585,161.05,20.136
103,linear_probing,,,,1.0,25140,0.264719,0.888576,0.646307,0.752066,0.566625,129.134,173.053,21.636
