In [1]:
!pip install datasets transformers[torch] peft evaluate accelerate -q

In [8]:
import torch
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
import time
import psutil
import os

# Single seed shared by the torch / NumPy global generators here and passed
# as `seed=SEED` to every TrainingArguments in this notebook.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# 1. Load Dataset / base metrics

In [None]:
# Load the emotion dataset and show its splits.
dataset = load_dataset("dair-ai/emotion")
print(dataset)

# Show a single raw training record.
print("\nПример строки:")
print(dataset['train'][0])

MODEL_NAME = "bert-base-uncased"

# Class names are read from the label feature of the training split.
label_names = dataset['train'].features['label'].names
NUM_LABELS = len(label_names)
print(f"\nКлассы: {label_names}")
print(f"Количество классов: {NUM_LABELS}")

# Tokenizer matching the checkpoint used by every experiment below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry, as supplied by ``dataset.map``.

    Returns:
        The value returned by calling ``tokenizer`` on the texts.
    """
    texts = examples["text"]
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(texts, **encode_options)

# Tokenize in batches, drop the raw text column, and rename the target
# column from "label" to "labels".
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
).rename_column("label", "labels")

# Switch the output format of the datasets to torch.
tokenized_datasets.set_format("torch")

In [9]:
class ResourceProfilerCallback(TrainerCallback):
    """Trainer callback that records wall-clock time and memory around training.

    Attributes:
        start_time, end_time: ``time.time()`` stamps taken in ``on_train_begin``
            and ``on_train_end``; ``None`` until the corresponding hook has run.
        start_mem, end_mem: Memory readings in bytes; ``None`` until the
            corresponding hook has run. When CUDA is available ``start_mem`` is
            the GPU memory allocated at the start of training and ``end_mem`` is
            the peak GPU allocation since the peak counter was reset in
            ``on_train_begin``. Otherwise both are the resident set size of the
            current process at the respective moment.
        process: ``psutil.Process`` handle for the current process.
    """

    def __init__(self):
        super().__init__()
        # Every attribute exists from construction on, so reading a result
        # before training has finished yields None instead of AttributeError.
        self.start_time = None
        self.end_time = None
        self.start_mem = None
        self.end_mem = None
        self.process = psutil.Process(os.getpid())

    def on_train_begin(self, args, state, control, **kwargs):
        """Record the start time and the starting memory reading."""
        self.start_time = time.time()
        if torch.cuda.is_available():
            # Restart peak tracking so `end_mem` only reflects this training run.
            torch.cuda.reset_peak_memory_stats()
            self.start_mem = torch.cuda.memory_allocated()
        else:
            self.start_mem = self.process.memory_info().rss

    def on_train_end(self, args, state, control, **kwargs):
        """Record the end time and the final memory reading."""
        self.end_time = time.time()
        if torch.cuda.is_available():
            self.end_mem = torch.cuda.max_memory_allocated()
        else:
            self.end_mem = self.process.memory_info().rss

def compute_metrics(eval_pred):
    """Compute accuracy of the arg-max predictions.

    NOTE: a later cell of this notebook defines ``compute_metrics`` again;
    Trainers created after that cell use the later definition.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of the ``evaluate`` accuracy metric's ``compute`` call.
    """
    accuracy = evaluate.load("accuracy")
    raw_scores, references = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [17]:
# Metric objects are loaded once here; compute_metrics runs on every evaluation
# pass and would otherwise reload both metrics each time.
_ACCURACY_METRIC = evaluate.load("accuracy")
_F1_METRIC = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, arg-maxed
            over axis 1) and ``label_ids`` (reference class ids).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    # The arg-max is shared by both metrics, so compute it only once.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids

    accuracy = _ACCURACY_METRIC.compute(
        predictions=predicted_ids,
        references=references
    )["accuracy"]

    weighted_f1 = _F1_METRIC.compute(
        predictions=predicted_ids,
        references=references,
        average="weighted"
    )["f1"]

    return {"accuracy": accuracy, "f1": weighted_f1}


# Load the pre-trained checkpoint configured for NUM_LABELS classes; no training
# happens in this cell, the model is only evaluated.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True
)

# Evaluation-only configuration.
training_args = TrainingArguments(
    output_dir="./tmp_eval",
    per_device_eval_batch_size=64,
    do_train=False,
    do_eval=True,
    seed=SEED,
    dataloader_drop_last=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    # `processing_class` is the replacement for the deprecated `tokenizer`
    # argument of Trainer.
    processing_class=tokenizer
)

# Restart peak tracking so the reported figure reflects this evaluation rather
# than any earlier GPU work done in the same kernel.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

print("Начало базовой оценки модели без дообучения...")
baseline_metrics = trainer.evaluate(tokenized_datasets["test"])

print("\nРезультаты базовой оценки:")
print(f"Test samples: {len(tokenized_datasets['test'])}")
print(f"Accuracy: {baseline_metrics['eval_accuracy']:.4f}")
print(f"Weighted F1: {baseline_metrics['eval_f1']:.4f}\n")

# Reference numbers for the summary table at the end of the notebook.
BASELINE_RESULTS = {
    "accuracy": baseline_metrics["eval_accuracy"],
    "f1": baseline_metrics["eval_f1"],
    "memory_usage": None,
    "params": sum(p.numel() for p in model.parameters())
}

if torch.cuda.is_available():
    # Peak allocated GPU memory since the reset above, converted to GiB.
    BASELINE_RESULTS["memory_usage"] = torch.cuda.max_memory_allocated() / 1024**3
    print(f"Пиковое использование GPU памяти: {BASELINE_RESULTS['memory_usage']:.2f} GB")
else:
    # Resident set size of this process at this point, converted to GiB.
    process = psutil.Process(os.getpid())
    BASELINE_RESULTS["memory_usage"] = process.memory_info().rss / 1024**3
    print(f"Пиковое использование RAM: {BASELINE_RESULTS['memory_usage']:.2f} GB")

  trainer = Trainer(


Начало базовой оценки модели без дообучения...



Результаты базовой оценки:
Test samples: 2000
Accuracy: 0.1360
Weighted F1: 0.0458

Пиковое использование GPU памяти: 1.49 GB


# 2. Full Finetuning

In [19]:
# Fresh copy of the pre-trained checkpoint; every parameter stays trainable.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True
)

training_args = TrainingArguments(
    output_dir="./full_finetuning",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="no",
    seed=SEED,
    report_to="none",
    load_best_model_at_end=False,
    # Mixed precision only when a GPU is present.
    fp16=torch.cuda.is_available(),
)

profiler = ResourceProfilerCallback()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[profiler]
)

print("Начало полного дообучения модели...")
start_time = time.time()
trainer.train()
training_time = time.time() - start_time

print("\nОценка на тестовых данных...")
test_metrics = trainer.evaluate(tokenized_datasets["test"])

FULL_FINETUNE_RESULTS = {
    "accuracy": test_metrics["eval_accuracy"],
    "f1": test_metrics["eval_f1"],
    "training_time": training_time,
    "params": sum(p.numel() for p in model.parameters() if p.requires_grad),
    # The callback already picks GPU or process memory, so a single
    # bytes -> GiB conversion covers both cases.
    "memory_usage": (profiler.end_mem - profiler.start_mem) / 1024**3
}

print("\nРезультаты полного дообучения:")
print(f"Время обучения: {FULL_FINETUNE_RESULTS['training_time']/60:.1f} минут")
print(f"Потребление памяти: {FULL_FINETUNE_RESULTS['memory_usage']:.2f} GB")
print(f"Обучаемые параметры: {FULL_FINETUNE_RESULTS['params']:,}")
print(f"Test Accuracy: {FULL_FINETUNE_RESULTS['accuracy']:.4f}")
print(f"Test F1: {FULL_FINETUNE_RESULTS['f1']:.4f}")

trainer.save_model("./full_finetuning/final_model")

Начало полного дообучения модели...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.24,0.217316,0.9175,0.918468
2,0.1324,0.153112,0.9335,0.93343
3,0.0915,0.155981,0.9385,0.93825



Оценка на тестовых данных...



Результаты полного дообучения:
Время обучения: 5.2 минут
Потребление памяти: 2.59 GB
Обучаемые параметры: 109,486,854
Test Accuracy: 0.9300
Test F1: 0.9293


# 3. Linear Probing

In [21]:
# Fresh copy of the pre-trained checkpoint for the linear-probing experiment.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS, ignore_mismatched_sizes=True
)

# Freeze every parameter of the loaded model.
model.requires_grad_(False)

class CustomClassificationHead(torch.nn.Module):
    """Dropout followed by a single linear projection to class logits.

    Args:
        hidden_size: Size of the last dimension of the incoming features.
        num_labels: Number of output logits.
        dropout_prob: Dropout probability applied before the projection.
    """

    def __init__(self, hidden_size, num_labels, dropout_prob=0.1):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout_prob)
        self.linear = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)

        # Xavier-normal weights and an all-zero bias.
        torch.nn.init.xavier_normal_(self.linear.weight)
        torch.nn.init.zeros_(self.linear.bias)

    def forward(self, features):
        """Apply dropout to ``features`` and project them with the linear layer."""
        return self.linear(self.dropout(features))

# Replace the model's classifier with a fresh head sized from the model config.
# NOTE(review): BertForSequenceClassification presumably already applies dropout
# to the pooled output before `classifier`, in which case this head adds a
# second dropout — confirm this is intended.
hidden_size = model.config.hidden_size
model.classifier = CustomClassificationHead(hidden_size, NUM_LABELS)

# Make the intent explicit: only the new head is trained.
for param in model.classifier.parameters():
    param.requires_grad = True

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Обучаемых параметров: {trainable_params}")

training_args = TrainingArguments(
    output_dir="./linear_probing",
    num_train_epochs=5,
    per_device_train_batch_size=128,
    learning_rate=1e-3,
    eval_strategy="epoch",
    logging_steps=30,
    # No intermediate checkpoints, matching the full fine-tuning run, so the
    # measured training time does not include checkpoint writes.
    save_strategy="no",
    seed=SEED,
    report_to="none",
    fp16=torch.cuda.is_available()
)

profiler = ResourceProfilerCallback()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[profiler]
)

print("Начало обучения с Linear Probing...")
start_time = time.time()
trainer.train()
training_time = time.time() - start_time

test_metrics = trainer.evaluate(tokenized_datasets["test"])

LINEAR_RESULTS = {
    "accuracy": test_metrics["eval_accuracy"],
    "f1": test_metrics["eval_f1"],
    "training_time": training_time,
    "params": trainable_params,
    # Callback readings are in bytes; convert the difference to GiB.
    "memory_usage": (profiler.end_mem - profiler.start_mem)/1024**3
}

print("\nРезультаты Linear Probing:")
print(f"Время обучения: {LINEAR_RESULTS['training_time']/60:.1f} мин")
print(f"Память: {LINEAR_RESULTS['memory_usage']:.2f} GB")
print(f"Accuracy: {LINEAR_RESULTS['accuracy']:.4f}")
print(f"F1: {LINEAR_RESULTS['f1']:.4f}")

Обучаемых параметров: 4614
Начало обучения с Linear Probing...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5777,1.506686,0.4425,0.331843
2,1.5393,1.475309,0.443,0.336091
3,1.5298,1.461126,0.484,0.372644
4,1.495,1.446199,0.476,0.365838
5,1.467,1.438907,0.48,0.368799



Результаты Linear Probing:
Время обучения: 2.8 мин
Память: 0.39 GB
Accuracy: 0.4695
F1: 0.3638


# 4. PEFT

In [23]:
from peft import PromptTuningConfig, get_peft_model

# Load the base model first so the prompt config reads `hidden_size` from the
# model it will actually wrap, not from whatever `model` an earlier cell left
# in the kernel.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True
)

prompt_config = PromptTuningConfig(
    task_type="SEQ_CLS",
    num_virtual_tokens=10,
    token_dim=model.config.hidden_size,
    num_transformer_submodules=1
)
model = get_peft_model(model, prompt_config)

# NOTE(review): the recorded run of this cell printed 7680 trainable parameters,
# which equals num_virtual_tokens * hidden_size if hidden_size is 768 (presumably
# the case for bert-base-uncased). If so, the classification head is not among
# the trainable parameters — verify that this is intended.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Обучаемых параметров: {trainable_params}\n")

training_args = TrainingArguments(
    output_dir="./prompt_tuning",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    learning_rate=1e-4,
    eval_strategy="epoch",
    logging_steps=30,
    # No intermediate checkpoints, matching the full fine-tuning run, so the
    # measured training time does not include checkpoint writes.
    save_strategy="no",
    seed=SEED,
    report_to="none",
    fp16=torch.cuda.is_available()
)

profiler = ResourceProfilerCallback()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[profiler]
)

print("Начало Prompt Tuning...")
start_time = time.time()
trainer.train()
training_time = time.time() - start_time

test_metrics = trainer.evaluate(tokenized_datasets["test"])

PROMPT_RESULTS = {
    "accuracy": test_metrics["eval_accuracy"],
    "f1": test_metrics["eval_f1"],
    "training_time": training_time,
    "params": trainable_params,
    # Callback readings are in bytes; convert the difference to GiB.
    "memory_usage": (profiler.end_mem - profiler.start_mem)/1024**3
}

print("\nРезультаты Prompt Tuning:")
print(f"Время обучения: {PROMPT_RESULTS['training_time']/60:.1f} мин")
print(f"Память: {PROMPT_RESULTS['memory_usage']:.2f} GB")
print(f"Accuracy: {PROMPT_RESULTS['accuracy']:.4f}")
print(f"F1: {PROMPT_RESULTS['f1']:.4f}")

Обучаемых параметров: 7680

Начало Prompt Tuning...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.8582,1.849769,0.102,0.08485
2,1.8473,1.817693,0.1825,0.151028
3,1.8162,1.791637,0.2735,0.180385
4,1.807,1.772064,0.324,0.191063
5,1.7857,1.759832,0.338,0.190736
6,1.7914,1.751523,0.346,0.189761
7,1.7703,1.746459,0.3485,0.188291
8,1.7678,1.743216,0.351,0.189033
9,1.7731,1.741363,0.352,0.187785
10,1.7686,1.740701,0.352,0.187785



Результаты Prompt Tuning:
Время обучения: 12.3 мин
Память: 1.31 GB
Accuracy: 0.3465
F1: 0.1822


# 5. LoRA

In [25]:
from peft import LoraConfig, get_peft_model

# LoRA adapters on the modules named "query" and "value".
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

# Fresh copy of the pre-trained checkpoint, then wrapped with the adapters.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True
)
model = get_peft_model(model, lora_config)

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Обучаемых параметров: {trainable_params}\n")

training_args = TrainingArguments(
    output_dir="./lora_tuning",
    num_train_epochs=4,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    eval_strategy="epoch",
    logging_steps=30,
    # No intermediate checkpoints, matching the full fine-tuning run, so the
    # measured training time does not include checkpoint writes.
    save_strategy="no",
    seed=SEED,
    report_to="none",
    fp16=torch.cuda.is_available()
)

profiler = ResourceProfilerCallback()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[profiler]
)

print("Начало обучения с LoRA...")
start_time = time.time()
trainer.train()
training_time = time.time() - start_time

test_metrics = trainer.evaluate(tokenized_datasets["test"])

LORA_RESULTS = {
    "accuracy": test_metrics["eval_accuracy"],
    "f1": test_metrics["eval_f1"],
    "training_time": training_time,
    "params": trainable_params,
    # Callback readings are in bytes; convert the difference to GiB.
    "memory_usage": (profiler.end_mem - profiler.start_mem)/1024**3
}

print("\nРезультаты LoRA:")
print(f"Время обучения: {LORA_RESULTS['training_time']/60:.1f} мин")
print(f"Память: {LORA_RESULTS['memory_usage']:.2f} GB")
print(f"Accuracy: {LORA_RESULTS['accuracy']:.4f}")
print(f"F1: {LORA_RESULTS['f1']:.4f}")

Обучаемых параметров: 299526

Начало обучения с LoRA...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5126,1.450815,0.481,0.366448
2,1.2903,1.204424,0.5555,0.433933
3,1.1793,1.153362,0.57,0.449521
4,1.1497,1.136958,0.574,0.452511



Результаты LoRA:
Время обучения: 5.1 мин
Память: 1.40 GB
Accuracy: 0.5840
F1: 0.4645


In [28]:
# Sweep over the LoRA rank. Note that lora_alpha is tied to the rank here (2*r)
# and lora_dropout is left at its default, unlike the single LoRA run above.
r_values = [4, 8, 16]
results = []

for r in r_values:
    lora_config = LoraConfig(
        r=r,
        lora_alpha=2*r,
        target_modules=["query", "value"],
        task_type="SEQ_CLS"
    )

    # Each rank starts from a fresh copy of the pre-trained checkpoint.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        ignore_mismatched_sizes=True
    )
    model = get_peft_model(model, lora_config)

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Обучаемых параметров: {trainable_params}\n")

    training_args = TrainingArguments(
        # One directory per rank so runs do not write into each other's output.
        output_dir=f"./lora_tuning_r{r}",
        num_train_epochs=4,
        per_device_train_batch_size=32,
        learning_rate=2e-5,
        eval_strategy="epoch",
        logging_steps=30,
        # No intermediate checkpoints, matching the full fine-tuning run, so the
        # measured training time does not include checkpoint writes.
        save_strategy="no",
        seed=SEED,
        report_to="none",
        fp16=torch.cuda.is_available()
    )

    profiler = ResourceProfilerCallback()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        compute_metrics=compute_metrics,
        callbacks=[profiler]
    )

    print("Начало обучения с LoRA...")
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    test_metrics = trainer.evaluate(tokenized_datasets["test"])

    RESULTS = {
        "accuracy": test_metrics["eval_accuracy"],
        "f1": test_metrics["eval_f1"],
        "training_time": training_time,
        "params": trainable_params,
        # Callback readings are in bytes; convert the difference to GiB.
        "memory_usage": (profiler.end_mem - profiler.start_mem)/1024**3
    }

    results.append({
        "r": r,
        "RESULTS": RESULTS
    })

# Show the collected test metrics so the ranks can be compared directly.
for entry in results:
    scores = entry["RESULTS"]
    print(
        f"r={entry['r']}: accuracy={scores['accuracy']:.4f}, "
        f"f1={scores['f1']:.4f}, params={scores['params']}"
    )

Обучаемых параметров: 152070

Начало обучения с LoRA...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5692,1.556132,0.375,0.25203
2,1.5207,1.471571,0.45,0.340274
3,1.3848,1.350873,0.5105,0.396351
4,1.3375,1.3258,0.515,0.40062


Обучаемых параметров: 299526

Начало обучения с LoRA...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5533,1.532512,0.396,0.277798
2,1.3865,1.310184,0.5255,0.409806
3,1.2876,1.249239,0.544,0.426031
4,1.2623,1.2353,0.545,0.427395


Обучаемых параметров: 594438

Начало обучения с LoRA...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.4611,1.390125,0.499,0.384541
2,1.2873,1.207331,0.559,0.437578
3,1.1874,1.153918,0.5715,0.449954
4,1.1764,1.139394,0.5735,0.451351


Получаем, что параметры
```
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)
```
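дают лучший результат среди проверенных конфигураций LoRA (в переборе по r использовались lora_alpha=2*r и lora_dropout по умолчанию).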

# 6. Results

In [26]:
from tabulate import tabulate

def _metrics_row(label, res, params_scale, params_suffix):
    """Format one method's result dict as a row of table cells.

    ``params_scale`` / ``params_suffix`` control how the parameter count is
    shown (e.g. divided by 1e6 with an "M" suffix).
    """
    return [
        label,
        f"{res['accuracy']:.3f}",
        f"{res['f1']:.3f}",
        f"{res['training_time']/60:.1f}",
        f"{res['params']/params_scale:.1f}{params_suffix}",
        f"{res['memory_usage']:.1f}",
    ]


# The header row comes first; the baseline has no training time or trainable
# parameter count, so those two cells are dashes.
results_table = [
    ["Method", "Accuracy", "F1", "Time (min)", "Params", "Memory (GB)"],
    [
        "Baseline (no tune)",
        f"{BASELINE_RESULTS['accuracy']:.3f}",
        f"{BASELINE_RESULTS['f1']:.3f}",
        "-",
        "-",
        f"{BASELINE_RESULTS['memory_usage']:.1f}",
    ],
    _metrics_row("Full Finetuning", FULL_FINETUNE_RESULTS, 1e6, "M"),
    _metrics_row("Linear Probing", LINEAR_RESULTS, 1e3, "K"),
    _metrics_row("Prompt Tuning", PROMPT_RESULTS, 1e3, "K"),
    _metrics_row("LoRA (r=8)", LORA_RESULTS, 1e6, "M"),
]

print("\nСводная таблица результатов:")
print(tabulate(results_table, headers="firstrow", tablefmt="github", stralign="center"))


Сводная таблица результатов:
|       Method       |   Accuracy |    F1 |  Time (min)  |  Params  |   Memory (GB) |
|--------------------|------------|-------|--------------|----------|---------------|
| Baseline (no tune) |      0.136 | 0.046 |      -       |    -     |           1.5 |
|  Full Finetuning   |      0.93  | 0.929 |     5.2      |  109.5M  |           2.6 |
|   Linear Probing   |      0.469 | 0.364 |     2.8      |   4.6K   |           0.4 |
|   Prompt Tuning    |      0.346 | 0.182 |     12.3     |   7.7K   |           1.3 |
|     LoRA (r=8)     |      0.584 | 0.464 |     5.1      |   0.3M   |           1.4 |


Вывод:

 - Full Finetuning показал максимальное качество, что ожидаемо, так как все параметры модели адаптируются под задачу.
 - LoRA демонстрирует лучший баланс среди PEFT-методов, но его качество значительно уступает полному дообучению.
 - Prompt Tuning работает хуже Linear Probing, что противоречит ожиданиям.
    Возможные причины:
    - Неоптимальные гиперпараметры (например, мало виртуальных токенов или слишком низкий learning rate).
    - Специфика датасета (короткие тексты, слабая связь с промптами).
 - Baseline (no tune) подтверждает, что предобученная модель без дообучения не подходит для задачи классификации эмоций.