In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.utils import class_weight
from torch.utils.data import Dataset
from transformers.trainer_utils import SchedulerType

In [2]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Load the train and test splits; the first CSV column becomes the index,
# which a later cell uses to look rows up in the full dataset.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_ajt_df.csv", "test_ajt_df.csv")
)

In [4]:
# Full dataset; the next cell selects its `type_mistake` column by row index
# for the rows present in the train/test splits.
df = pd.read_csv("ajt_dataset.csv")

In [5]:
# Pull the mistake category of every split row out of the full dataset
# (rows are matched on the index) and append it as a new column.
train_mistake_types = df.loc[train_df.index, 'type_mistake']
test_mistake_types = df.loc[test_df.index, 'type_mistake']

train_df = pd.concat([train_df, train_mistake_types], axis=1)
test_df = pd.concat([test_df, test_mistake_types], axis=1)

In [6]:
# Replace the dataset-level index with a fresh 0..n-1 range on both splits;
# the old index values are discarded, not kept as a column.
train_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, test_df)
)

In [7]:
# Mistake categories in class-id order: the position in this list is the id
# used by the "binary" configuration below.
_MISTAKE_TYPES = [
    "нет ошибки",      # no mistake
    "речевая",         # speech
    "стилистическая",  # stylistic
    "пунктуационная",  # punctuation
    "грамматическая",  # grammatical
    "лексическая",     # lexical
    "логическая",      # logical
]

# if binary
label_mapper = {name: idx for idx, name in enumerate(_MISTAKE_TYPES)}
reverse_label_mapper = {idx: name for name, idx in label_mapper.items()}
# if multi ("no mistake" becomes -1, the six real mistake types become 0..5)
# label_mapper = {name: idx - 1 for idx, name in enumerate(_MISTAKE_TYPES)}
# reverse_label_mapper = {idx: name for name, idx in label_mapper.items()}

In [8]:
# Encode the textual mistake category as its integer class id in both splits.
# Values missing from `label_mapper` come out of `.map` as NaN.
for split in (train_df, test_df):
    split['type_mistake'] = split['type_mistake'].map(label_mapper)

In [9]:
# if binary
# 'balanced' class weights computed from the training labels only.
binary_targets = train_df['is_mistake'].values
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(binary_targets),
    y=binary_targets,
)

# if multi
# (rows encoded as -1 are removed from both splits before weighting)
# train_df = train_df[train_df['type_mistake'] != -1]
# test_df = test_df[test_df['type_mistake'] != -1 ]

# class_weights = class_weight.compute_class_weight(class_weight='balanced',
#                                                     classes=np.unique(train_df['type_mistake'].values),
#                                                     y=train_df['type_mistake'].values)

In [9]:
# # if ru-en-RoSBERTa
# prefix = "classification: "
# train_df['text'] = prefix + train_df['text'] 
# test_df['text'] = prefix + test_df['text'] 

In [10]:
# if binary
target_col, drop_col = "is_mistake", "type_mistake"

# # if multi
# target_col, drop_col = "type_mistake", "is_mistake"

# The chosen target column becomes "label" (the name the dataset class reads);
# the other label column is removed.
label_renaming = {target_col: "label"}
unused_columns = [drop_col]

train_df = train_df.rename(columns=label_renaming).drop(columns=unused_columns)
test_df = test_df.rename(columns=label_renaming).drop(columns=unused_columns)

In [11]:
# Checkpoint to fine-tune; uncomment one of the alternatives to switch model.
# MODEL_NAME = "RussianNLP/ruRoBERTa-large-rucola"
# MODEL_NAME = "ai-forever/ru-en-RoSBERTa"
MODEL_NAME = "DeepPavlov/rubert-base-cased"

# Tokenised sequence length (inputs are padded/truncated to it) and batch size.
MAX_LENGHT, BS = 128, 16

# Number of output classes:
# # if binary
NUM_LABELS = 2
# # if multi
# NUM_LABELS = 6

# Optimisation schedule: learning rate and number of training epochs.
LR, NUM_EPOCHS = 2e-5, 20

In [12]:
class DataFrameDataset(Dataset):
    """Torch dataset that tokenises one DataFrame row per item.

    The frame must provide a "text" column (passed to the tokenizer) and a
    "label" column (converted to a ``torch.long`` tensor).
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the frame, the tokenizer callable and the fixed sequence length."""
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenise the row at position ``idx`` and return the model inputs.

        Returns a dict with flattened "input_ids" and "attention_mask" tensors
        plus a scalar "labels" tensor.
        """
        row = self.data.iloc[idx]
        tokenizer_options = {
            "padding": "max_length",
            "truncation": True,
            "max_length": self.max_length,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(row["text"], **tokenizer_options)

        # The tokenizer output is flattened so each item is a 1-D tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(row["label"], dtype=torch.long)
        return sample

In [13]:
def compute_metrics(eval_pred):
    """Compute accuracy, F1 and Matthews correlation for an evaluation pass.

    Args:
        eval_pred: unpacks into ``(logits, labels)``; ``logits`` must support
            ``argmax(axis=-1)`` (presumably a NumPy array supplied by the
            trainer -- confirm against the caller).

    Returns:
        dict with keys "Accuracy", "F1" and "MCC". F1 uses binary averaging
        when the global ``NUM_LABELS`` is 2 and macro averaging otherwise.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    f1_average = "binary" if NUM_LABELS == 2 else "macro"

    return {
        "Accuracy": accuracy_score(labels, predictions),
        "F1": f1_score(labels, predictions, average=f1_average),
        "MCC": matthews_corrcoef(labels, predictions),
    }

In [14]:
# Where the trainer writes its outputs and its logs.
OUT_DIR, LOG_DIR = "./trans_clf", "./trans_clf_logs"
# Warm-up ratio handed to the training arguments.
WM_RATIO = 0.1

In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset = DataFrameDataset(train_df, tokenizer, MAX_LENGHT)
# NOTE: the test split is used as the evaluation set during training, so the
# per-epoch metrics are measured on the same data that is reported as "test".
val_dataset = DataFrameDataset(test_df, tokenizer, MAX_LENGHT)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
# Move the model to the device selected at the top of the notebook instead of
# hard-coding 'cuda', which raised on machines without a GPU.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [16]:
# # if roberta-multi
# class MultiClassClassifier(torch.nn.Module):
#     def __init__(self, model_name, num_labels=6):
#         super().__init__()
#         self.model = AutoModel.from_pretrained(model_name)
#         self.dropout = torch.nn.Dropout(0.1)
#         self.error_classifier = torch.nn.Linear(self.model.config.hidden_size, num_labels)
    
#     def forward(self, input_ids, attention_mask=None, labels=None):
#         outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
#         pooled_output = outputs.pooler_output # cls
#         pooled_output = self.dropout(pooled_output)
#         logits = self.error_classifier(pooled_output)

#         loss = None
#         if labels is not None:
#             loss_fct = torch.nn.CrossEntropyLoss()
#             loss = loss_fct(logits, labels)

#         return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

# model = MultiClassClassifier(MODEL_NAME)

In [16]:
FREEZE = True

if FREEZE:
    # Resolve the transformer backbone without hard-coding `.bert`, so the cell
    # also works for the RoBERTa-based checkpoints listed in the config cell.
    # Hugging Face models expose the backbone as `base_model`; the commented-out
    # MultiClassClassifier wrapper keeps it in `.model` instead.
    backbone = getattr(model, "base_model", None)
    if backbone is None:
        backbone = model.model

    # Freeze every encoder layer except the last one. Everything else
    # (including the final encoder layer) stays trainable.
    for layer in backbone.encoder.layer[:-1]:
        for param in layer.parameters():
            param.requires_grad = False

# Report how many parameters will actually be updated.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_params = total_params - trainable_params

print(f"Trainable parameters: {trainable_params / 1e6:.2f}M")
print(f"Frozen parameters: {frozen_params / 1e6:.2f}M")

Trainable parameters: 99.89M
Frozen parameters: 77.97M


In [17]:
class WeightedTrainer(Trainer):
    """Trainer that replaces the model's own loss with (optionally class-weighted) cross-entropy.

    Args:
        class_weights: per-class weights (NumPy array, list or tensor) passed
            to ``nn.CrossEntropyLoss``. ``None`` (the default) gives an
            unweighted loss instead of raising.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        weight = None
        if class_weights is not None:
            # Accept arrays, lists and tensors alike; the loss needs float weights.
            weight = torch.as_tensor(class_weights, dtype=torch.float)
        self.criterion = nn.CrossEntropyLoss(weight=weight)
        # Keep the weights on the device this trainer runs on rather than on a
        # notebook-level global, so they always match the logits' device.
        self.criterion.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits with ``self.criterion``.

        ``labels`` is removed from ``inputs`` before the forward pass so the
        model does not compute its own loss. Extra keyword arguments are
        accepted and ignored.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Works for dict-like outputs (including plain dicts) and for tuple
        # outputs, whose first element is the logits when no labels are given.
        logits = outputs["logits"] if isinstance(outputs, dict) else outputs[0]

        loss = self.criterion(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [18]:
# Training configuration: evaluate and log once per epoch, never checkpoint,
# linear learning-rate schedule with warm-up.
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    # `evaluation_strategy` is the deprecated spelling of this argument; the
    # transformers version that produced this notebook's output already
    # accepts `eval_strategy`.
    eval_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    num_train_epochs=NUM_EPOCHS,
    save_strategy="no",
    logging_dir=LOG_DIR,
    logging_strategy="epoch",
    lr_scheduler_type=SchedulerType.LINEAR,
    warmup_ratio=WM_RATIO,
)



In [None]:
# Set to True to train with the class-weighted loss.
BALANCED = False

# Arguments shared by both trainer variants.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# The weighted variant additionally receives the class weights for its loss.
if BALANCED:
    trainer = WeightedTrainer(class_weights=class_weights, **trainer_kwargs)
else:
    trainer = Trainer(**trainer_kwargs)

trainer.train()

In [None]:
# # rubert-base-cased
# binary
# Validation Loss	Accuracy	F1	Mcc
# 0.567981	0.725888	0.706522	0.449981
# multi
# 1.645356	0.351064	0.189984	0.126034

In [None]:
# # ruRoBERTa-large-rucola
# binary
# Validation Loss	Accuracy	F1	Mcc
# 0.655968	0.695431	0.552239	0.452574
# multi
# 1.687713	0.361702	0.210077	0.156871

In [None]:
# # ru-en-RoSBERTa
# binary
# Validation Loss	Accuracy	F1	Mcc
# 0.575851	0.725888	0.689655	0.451698
# multi
# 1.682512	0.287234	0.149063	0.030868