## Установка и импорты библиотек

In [3]:
%pip install datasets peft huggingface_hub -q

  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [24]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments, AutoModel, BertTokenizerFast, AutoModelForSequenceClassification
from datasets import load_dataset, DatasetDict, Dataset
from peft import LoraConfig, get_peft_model

## Считывание данных

In [25]:
import pandas as pd
df = pd.read_csv('/kaggle/input/recommendation-possibility-dataset3/recommendation_possibility_dataset.csv', encoding='utf-8')

In [26]:
df.head()

Unnamed: 0,answer,label
0,"да, могу рекомендовать",1
1,Не все руководители токсичны и газлайтят + в о...,2
2,У меня нет информации по другим вакансиям и ст...,0
3,Остались положительные впечатления от работы в...,1
4,Политика Гринатома мне очень импанирует. Корпо...,1


In [27]:
df.shape

(992, 2)

In [28]:
train_texts = list(df['answer'].values)
train_labels = list(df['label'].values)

## Разбиение на обучающую и тестовую выборку

In [29]:
from sklearn.model_selection import train_test_split

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

train_texts = X_train
train_labels = y_train

val_texts = X_train[:100]
val_labels = y_train[:100]

test_texts = X_test
test_labels = y_test

## Обработка датасета

In [30]:
# Проверяем доступность GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Используем устройство: {device}')

Используем устройство: cuda


In [31]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# Функция для вычисления метрик, включая F1-score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    f1 = f1_score(labels, preds, average='weighted')  # Считаем взвешенный F1-score для многоклассовой классификации
    acc = accuracy_score(labels, preds)  # Дополнительно считаем точность
    return {
        'accuracy': acc,
        'f1': f1
    }

# Создаем обучающий и валидационный датасеты
train_dataset = Dataset.from_dict({
    'text': train_texts,
    'label': train_labels
})

val_dataset = Dataset.from_dict({
    'text': val_texts,
    'label': val_labels
})

test_dataset = Dataset.from_dict({
    'text': test_texts,
    'label': test_labels
})

#### Токенизация датасета

In [32]:
# Загружаем токенизатор
tokenizer = BertTokenizerFast.from_pretrained('blanchefort/rubert-base-cased-sentiment-rusentiment')

# Токенизация датасета
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

# Токенизируем датасеты
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
# Токенизация тестовой выборки
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

## Загрузка модели

In [33]:
model = AutoModelForSequenceClassification.from_pretrained('blanchefort/rubert-base-cased-sentiment-rusentiment', return_dict=True)

# Перемещение модели на устройство
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

## Проверка качества базовой модели

In [34]:
def predict(text, model, tokenizer):
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
    outputs = model(**inputs)
    predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted = torch.argmax(predicted, dim=1).cpu().numpy()
    return predicted

def number_to_string(value):
    mapping = {
      0: "NEUTRAL",
      1: "POSITIVE",
      2: "NEGATIVE"
    }
    return mapping.get(value, "Invalid number")

def get_model_class_answer(text: str, model, tokenizer) -> str:
    predicted_class_label = predict(text, model, tokenizer)[0]
    return number_to_string(predicted_class_label)          

In [35]:
from sklearn.metrics import accuracy_score, f1_score

# Функция для оценки качества модели на тестовой выборке
def evaluate_model(test_dataset, model, tokenizer):
    model.eval()  # Переводим модель в режим оценки
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in test_dataset:
            inputs = tokenizer(batch['text'], max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
            outputs = model(**inputs)
            predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
            predicted = torch.argmax(predicted, dim=1).cpu().numpy()
            predictions.extend(predicted)
            true_labels.extend([batch['label']])  # Замените 'label' на название вашего поля с истинными метками

    # Рассчитываем accuracy и f1-score
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')  # Можно использовать 'macro' или 'micro' в зависимости от ваших нужд

    return accuracy, f1

In [36]:
# Оценка модели на тестовой выборке
accuracy, f1 = evaluate_model(test_dataset, model, tokenizer)

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.5729
F1 Score: 0.5998


## Настройка метода LoRA для обучения

In [37]:
# Конфигурация LoRA
lora_config = LoraConfig(
    r=16,              # Rank параметр
    lora_alpha=32,   # Lora Alpha параметр
    lora_dropout=0.1, # Дропаут в LoRA
    target_modules=["query", "key", "value"],  # Слои, к которым применяется LoRA
    bias="none",      # Можно использовать 'all', 'lora_only' или 'none'
    task_type="SEQ_CLS"  # Тип задачи
)

In [38]:
# Применение LoRA к модели
fine_tuned_model = get_peft_model(model, lora_config)

In [39]:
# Инициализация счётчиков
total_params = 0
trainable_params = 0

# Замораживание всех слоёв, кроме LoRA слоёв
for name, param in model.named_parameters():
    total_params += param.numel()  # Считаем общее количество параметров
    if 'lora' not in name:
        param.requires_grad = False  # Замораживаем все параметры, кроме LoRA
    else:
        #print(f"Разморожен параметр {name}")
        trainable_params += param.numel()  # Считаем количество незамороженных параметров

# Вывод результата
print(f"Всего параметров: {total_params}")
print(f"Незамороженных параметров: {trainable_params}")
print(f"Процент незамороженных параметров: {trainable_params / total_params * 100:.2f}%")


Всего параметров: 178742790
Незамороженных параметров: 884736
Процент незамороженных параметров: 0.49%


## Дообучение

In [40]:
# Аргументы для тренировки
training_args = TrainingArguments(
    output_dir='/kaggle/working/staying_possibility_results',          # Директория для сохранения модели
    evaluation_strategy="epoch",     # Оценка на каждом эпохе
    learning_rate=2e-5,              # Скорость обучения
    per_device_train_batch_size=1,   # Размер батча для тренировки
    per_device_eval_batch_size=1,    # Размер батча для валидации
    num_train_epochs=10,              # Количество эпох
    weight_decay=0.01,               # Регуляризация веса
    logging_dir='/kaggle/working/logs',            # Директория для логов
    logging_steps=10,                # Шаги логгирования
    fp16=True,                       # Использовать смешанную точность (если поддерживается)
    save_strategy="no"
)

# Обучение модели с использованием Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Запуск обучения
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0292,0.34937,0.88,0.877246
2,0.5727,0.275298,0.91,0.909508
3,0.0292,0.201135,0.94,0.939262
4,0.0258,0.185162,0.95,0.949617
5,0.634,0.166188,0.96,0.959991
6,0.3002,0.132893,0.96,0.959722
7,0.1911,0.109524,0.96,0.959722
8,0.407,0.099424,0.96,0.959722
9,0.0061,0.089101,0.96,0.959722
10,0.014,0.084217,0.97,0.969631


TrainOutput(global_step=7930, training_loss=0.3061989908110148, metrics={'train_runtime': 824.286, 'train_samples_per_second': 9.62, 'train_steps_per_second': 9.62, 'total_flos': 2108098621624320.0, 'train_loss': 0.3061989908110148, 'epoch': 10.0})

## Проверка на тестовой выборке

In [41]:
# Оценка модели на тестовой выборке
accuracy, f1 = evaluate_model(test_dataset, fine_tuned_model, tokenizer)

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.9196
F1 Score: 0.9179


In [129]:
# Проверка модели на тестовой выборке
test_results = trainer.evaluate(test_dataset)

# Вывод результатов
# print(f"Результаты на тестовой выборке: {test_results}")
print("Test:")
print(test_results.get('eval_accuracy'))
print(test_results.get('eval_f1'))

Test:
0.8286026200873362
0.8255951215250679


## Тестовый инференс модели

In [None]:
def predict(text):
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
    outputs = model(**inputs)
    predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted = torch.argmax(predicted, dim=1).cpu().numpy()
    return predicted

def number_to_string(value):
    mapping = {
      0: "NEUTRAL",
      1: "POSITIVE",
      2: "NEGATIVE"
    }
    return mapping.get(value, "Invalid number")

def get_model_class_answer(text: str) -> str:
    predicted_class_label = predict(text)[0]
    return number_to_string(predicted_class_label)

In [25]:
get_model_class_answer('На данный момент я сосредоточен на поиске новой работы')

'NEGATIVE'

In [26]:
get_model_class_answer('Пока не думал(а) об этом.')

'NEUTRAL'

In [27]:
get_model_class_answer('Хотел бы.')

'POSITIVE'

## Сохранение модели

In [145]:
# Предположим, что вы обучили модель, теперь её нужно сохранить
model.save_pretrained("/kaggle/working/returning_possibility")
tokenizer.save_pretrained("/kaggle/working/returning_possibility")

('/kaggle/working/returning_possibility/tokenizer_config.json',
 '/kaggle/working/returning_possibility/special_tokens_map.json',
 '/kaggle/working/returning_possibility/vocab.txt',
 '/kaggle/working/returning_possibility/added_tokens.json',
 '/kaggle/working/returning_possibility/tokenizer.json')

## Загрузка адаптера в репозиторий Hugging Face

In [146]:
import shutil

# Указываем путь к директории, которую нужно архивировать
directory_to_archive = '/kaggle/working/returning_possibility'

# Имя итогового архива (без расширения)
archive_name = '/kaggle/working/returning_possibility_adapter_model'

# Указываем формат архива ('zip', 'tar', 'gztar', 'bztar', 'xztar')
format = 'zip'

# Создаем архив
shutil.make_archive(archive_name, format, directory_to_archive)

print(f'Архив {archive_name}.{format} успешно создан.')


Архив /kaggle/working/returning_possibility_adapter_model.zip успешно создан.


In [150]:
from IPython.display import FileLink
FileLink(r'returning_possibility_adapter_model.zip')

In [32]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
!huggingface-cli login

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


usage: huggingface-cli <command> [<args>]
huggingface-cli: error: unrecognized arguments: -y


In [51]:
!huggingface-cli repo create sirius_hack_staying_possibility -y

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[90mgit version 2.34.1[0m
[90mgit-lfs/3.0.2 (GitHub; linux amd64; go 1.18.1)[0m

You are about to create [1mVzvorygin/sirius_hack_staying_possibility[0m

Your repo now lives at:
  [1mhttps://huggingface.co/Vzvorygin/sirius_hack_staying_possibility[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/Vzvorygin/sirius_hack_staying_possibility



In [52]:
!git config --global credential.helper store

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [53]:
!git config --global user.email "cay108@yandex.ru"
!git config --global user.name "WocherZ"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
%cd staying_possibility

/kaggle/working/staying_possibility


In [57]:
%cd ..

/content


In [40]:
!pwd

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/kaggle/working/staying_possibility


In [42]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


README.md		   special_tokens_map.json  vocab.txt
adapter_config.json	   tokenizer.json
adapter_model.safetensors  tokenizer_config.json


In [41]:
!git lfs install

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Git LFS initialized.


In [54]:
!git lfs track "*.bin"
!git lfs track "*.safetensors"
!git add .gitattributes
!huggingface-cli lfs-enable-largefiles .

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


"*.bin" already supported


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


"*.safetensors" already supported


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Local repo set up for largefiles


In [46]:
!git add adapter_model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [47]:
!git lfs migrate import --everything

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


migrate: override changes in your working copy?  All uncommitted changes will be lost! [y/N] ^C
migrate: working copy must not be dirty


In [60]:
%pwd
!git init
!git remote add origin https://huggingface.co/Vzvorygin/sirius_hack_staying_possibility
!git add *
!git add adapter_model.safetensors
!git commit -m "Initial commit"
!git pull --rebase origin main
!git push origin main

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reinitialized existing Git repository in /kaggle/working/staying_possibility/.git/


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


error: remote origin already exists.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


On branch main
Last command done (1 command done):
   pick 4d4a249 Initial commit
No commands remaining.
You are currently editing a commit while rebasing branch 'master' on '7068086'.
  (use "git commit --amend" to amend the current commit)
  (use "git rebase --continue" once you are satisfied with your changes)

nothing to commit, working tree clean


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


From https://huggingface.co/Vzvorygin/sirius_hack_staying_possibility
 * branch            main       -> FETCH_HEAD
fatal: It seems that there is already a rebase-merge directory, and
I wonder if you are in the middle of another rebase.  If that is the
case, please try
	git rebase (--continue | --abort | --skip)
If that is not the case, please
	rm -fr ".git/rebase-merge"
and run me again.  I am stopping in case you still have something
valuable there.



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Username for 'https://huggingface.co': ^C


In [68]:
%pwd

'/content/staying_possibility'

In [59]:
!git pull --rebase https://huggingface.co/Vzvorygin/sirius_hack_staying_possibility

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


From https://huggingface.co/Vzvorygin/sirius_hack_staying_possibility
 * branch            HEAD       -> FETCH_HEAD
fatal: It seems that there is already a rebase-merge directory, and
I wonder if you are in the middle of another rebase.  If that is the
case, please try
	git rebase (--continue | --abort | --skip)
If that is not the case, please
	rm -fr ".git/rebase-merge"
and run me again.  I am stopping in case you still have something
valuable there.



In [58]:
!git branch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* [32mmain[m
  master[m


In [57]:
!git checkout -b main
!git add .
!git commit -m "Init commit"
!git push -u origin main

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Switched to a new branch 'main'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


On branch main
Last command done (1 command done):
   pick 4d4a249 Initial commit
No commands remaining.
You are currently editing a commit while rebasing branch 'master' on '7068086'.
  (use "git commit --amend" to amend the current commit)
  (use "git rebase --continue" once you are satisfied with your changes)

nothing to commit, working tree clean


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Username for 'https://huggingface.co': ^C
