#### Копируем репозиторий и устанавливаем зависимости

In [None]:
# Clone the course repository; it provides the requirements files installed in the next cell.
!git clone https://github.com/ai4se-course/ai4se-hse-course-24-25.git

Cloning into 'ai4se-hse-course-24-25'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 12 (delta 0), reused 0 (delta 0), pack-reused 5 (from 1)[K
Receiving objects: 100% (12/12), 4.42 KiB | 4.42 MiB/s, done.


In [None]:
!pip3 install -r /content/ai4se-hse-course-24-25/01-toxic-review-classification/requirements.txt \
              -r /content/ai4se-hse-course-24-25/01-toxic-review-classification/requirements_dev.txt

#### Импортируем библиотеки

In [None]:
import pandas as pd
import transformers
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

import nltk
import re

#### Объявляем собственный класс для набора данных
Класс принимает 4 параметра и преобразует данные в формат, удобный для RoBERTa:
* тексты
* метки
* токенизатор
* максимальная длина для токенов



In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of raw strings (list, pandas Series, ...).
        labels: Iterable of integer class ids, aligned one-to-one with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_length: Length every item is padded or truncated to, in tokens.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise both inputs as lists so that ``__getitem__`` can index them
        # by position; a pandas Series with a non-default index would otherwise
        # be looked up by label and fail (or return the wrong row).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return the tensors the model consumes.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten them so the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


#### Объявляем функцию для вычисления метрик

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object with ``label_ids`` (true classes) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    # Predicted class = index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)

    # Precision/recall/F1 are aggregated over classes with average='weighted'.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {'accuracy': accuracy}
    metrics['precision'] = prf_scores[0]
    metrics['recall'] = prf_scores[1]
    metrics['f1'] = prf_scores[2]
    return metrics


#### Немного предобработки

In [None]:
# Read the labelled code-review spreadsheet into a DataFrame.
dataset_path = 'code-review-dataset-full.xlsx'
df = pd.read_excel(dataset_path)

1. удаляем пустые ячейки
2. удаляем из текстов ссылки
3. удаляем небуквенные символы
4. удаляем дубликаты, т. к. они могли появиться после преобразований

In [None]:
# Drop rows with missing values and expose the target column under the name `label`.
# New frames are assigned instead of mutating in place, so the cell is safe to re-run.
df = df.dropna().rename(columns={'is_toxic': 'label'})

url_pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

# Strip URLs first, then replace every non-letter character with a space.
# `astype(str)` guards against spreadsheet cells that were parsed as numbers,
# on which `re.sub` would raise a TypeError.
df['text'] = (
    df['message']
    .astype(str)
    .apply(lambda sent: re.sub(url_pattern, "", sent))
    .apply(lambda sent: re.sub('[^a-zA-Z]', ' ', sent))
)

# Deduplicate on the cleaned text (plus its label). Comparing whole rows would never
# catch duplicates created by the cleaning step, because the untouched `message`
# column still tells those rows apart.
df = df.drop_duplicates(subset=['text', 'label'])

#### Объявляем токенизатор и модель


In [None]:
# Checkpoint name, maximum sequence length (in tokens) and target device.
MODEL_NAME = 'roberta-base'
MAX_LENGTH = 128
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Tokenizer and classification model are loaded from the same hub repository.
hub_repo = f"FacebookAI/{MODEL_NAME}"
tokenizer = RobertaTokenizer.from_pretrained(hub_repo, clean_up_tokenization_spaces=True)

# Two output labels; the model is moved to the selected device right after loading.
model_roberta = RobertaForSequenceClassification.from_pretrained(hub_repo, num_labels=2)
model_roberta = model_roberta.to(device)

#### Подготовка данных
1. Разбиваем данные на тренировочную, валидационную и тестовую выборки
2. Объявляем для каждой выборки свой датасет

In [None]:
# Step 1: hold out 20% of the rows as the evaluation split.
X_rest, X_eval, y_rest, y_eval = train_test_split(
    df['text'],
    df['label'],
    test_size=.2,
    random_state=42,
)

# Step 2: carve the test split (25% of the remainder) out of what is left;
# the rest becomes the training split.
X_train, X_test, y_train, y_test = train_test_split(
    X_rest,
    y_rest,
    test_size=.25,
    random_state=42,
)

In [None]:
def _build_dataset(texts, labels):
    """Wrap one split (texts and labels as pandas Series) into a CustomDataset."""
    return CustomDataset(
        texts.to_list(),
        labels.to_list(),
        tokenizer=tokenizer,
        max_length=MAX_LENGTH,
    )


# One dataset per split, all sharing the same tokenizer and sequence length.
train_dataset = _build_dataset(X_train, y_train)
eval_dataset = _build_dataset(X_eval, y_eval)
test_dataset = _build_dataset(X_test, y_test)

In [None]:
# Group the splits handed to the Trainer under the keys 'train' and 'eval'.
trainer_splits = {'train': train_dataset, 'eval': eval_dataset}
dataset = DatasetDict(trainer_splits)

#### Объявляем параметры для тренировки

In [None]:
TASK = 'toxic_classification'

# Evaluation, logging and checkpointing all happen once per epoch. The former
# `logging_steps=10` / `save_steps=10` arguments were dropped: step intervals are
# only honoured with the "steps" strategies, so they had no effect here.
training_args = TrainingArguments(
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    learning_rate=1e-4,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Keep only the most recent checkpoints on disk instead of one per epoch;
    # the best checkpoint is still retained because of `load_best_model_at_end`.
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir='./logs',
    output_dir='./results'
)


#### Объявляем класс-тренер

In [None]:
# Collect the Trainer inputs first: the model to fine-tune, the run configuration,
# the train/eval splits and the metric callback evaluated on the eval split.
trainer_kwargs = dict(
    model=model_roberta,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['eval'],
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

#### Запускаем обучение

In [None]:
%%time
# Run fine-tuning with the configured Trainer; the cell magic above times the whole run.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3531,0.260611,0.916699,0.914946,0.916699,0.915574
2,0.2454,0.317823,0.908175,0.904842,0.908175,0.905197
3,0.262,0.323332,0.893452,0.906526,0.893452,0.897533
4,0.28,0.29922,0.911275,0.9089,0.911275,0.906128
5,0.2668,0.299339,0.915537,0.913124,0.915537,0.911378
6,0.2648,0.295437,0.912824,0.91043,0.912824,0.908042
7,0.2626,0.295684,0.913212,0.910763,0.913212,0.908585
8,0.2582,0.290484,0.915537,0.912761,0.915537,0.912206
9,0.2459,0.301802,0.908563,0.906117,0.908563,0.906915
10,0.2204,0.305341,0.907013,0.905228,0.907013,0.905944


CPU times: user 29min 56s, sys: 29.1 s, total: 30min 25s
Wall time: 36min 31s


TrainOutput(global_step=2420, training_loss=0.26592264254231096, metrics={'train_runtime': 2188.9735, 'train_samples_per_second': 35.368, 'train_steps_per_second': 1.106, 'total_flos': 5092514476492800.0, 'train_loss': 0.26592264254231096, 'epoch': 10.0})

#### Сохраняем модель

In [None]:
# Keep a reference to the model's state dict.
# NOTE(review): `model_parameters` is not used in the visible cells (the model is
# persisted with save_pretrained) - confirm whether it is still needed.
model_parameters = model_roberta.state_dict()

In [None]:
# Persist the model and its tokenizer side by side in one directory.
save_dir = './fine_tuned_roberta'
model_roberta.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine_tuned_roberta/tokenizer_config.json',
 './fine_tuned_roberta/special_tokens_map.json',
 './fine_tuned_roberta/vocab.json',
 './fine_tuned_roberta/merges.txt',
 './fine_tuned_roberta/added_tokens.json')

#### Загрузим дообученную (fine-tuned) модель и проведём оценку на тестовом наборе

In [None]:
# Reload the classifier from the directory it was saved to.
checkpoint_dir = './fine_tuned_roberta'
model = RobertaForSequenceClassification.from_pretrained(checkpoint_dir)

In [None]:
%%time
test_loader = DataLoader(test_dataset, batch_size=32)

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch['labels'].cpu().numpy())

CPU times: user 9min 1s, sys: 1min 35s, total: 10min 37s
Wall time: 10min 49s


In [None]:
# Precision/recall/F1 on the test split, aggregated over classes with average='weighted'.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels,
    predictions,
    average='weighted',
)
# Share of test examples whose predicted class matches the true one.
acc = accuracy_score(true_labels, predictions)

test_metrics = dict(accuracy=acc, precision=precision, recall=recall, f1=f1)
print(test_metrics)

{'accuracy': 0.9240604416892677, 'precision': 0.9219866435422849, 'recall': 0.9240604416892677, 'f1': 0.9223984636774184}


In [None]:
# Pack the saved model directory into RoBERTa.zip (-r recurses into the directory).
!zip -r RoBERTa.zip fine_tuned_roberta

  adding: fine_tuned_roberta/ (stored 0%)
  adding: fine_tuned_roberta/code-review-dataset-full.xlsx (deflated 1%)
  adding: fine_tuned_roberta/merges.txt (deflated 53%)
  adding: fine_tuned_roberta/special_tokens_map.json (deflated 84%)
  adding: fine_tuned_roberta/model.safetensors (deflated 13%)
  adding: fine_tuned_roberta/config.json (deflated 50%)
  adding: fine_tuned_roberta/vocab.json (deflated 68%)
  adding: fine_tuned_roberta/tokenizer_config.json (deflated 76%)
