In [1]:
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, set_seed

# Random seed intended for reproducible runs.
# NOTE(review): presumably meant for RNG seeding — confirm where it is consumed.
SEED = 42
# Hugging Face Hub checkpoint id; the tokenizer and the sequence-classification
# model are both loaded from it via `from_pretrained(BASE_MODEL, ...)`.
BASE_MODEL = 'ai-forever/sbert_large_nlu_ru'

In [2]:
# Training frame: rename the source text column to `text`, then keep only
# the two columns the model needs.
df = (
    pd.read_csv('data/hackaton_result_dataset.csv', encoding='windows-1251')
    .rename(columns={'model_annotation': 'text'})
    .loc[:, ['text', 'label']]
    .copy()
)

# Evaluation frame: first 5000 rows only, reshaped to the same
# `text` / `label` layout as the training frame.
val_df = (
    pd.read_csv('data/validation-dataset.csv', encoding='windows-1251')
    .head(5000)
    .rename(columns={'annotation_fastconformer': 'text'})
    .loc[:, ['text', 'label']]
    .copy()
)

In [3]:
# Convert both pandas frames to Hugging Face `Dataset` objects in one pass,
# then bundle them under the split names used by the rest of the notebook.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df, val_df)
)
datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
})
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6508
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [4]:
# Length (in tokens) every example is padded / truncated to.
MAX_LENGTH = 512

# `model_max_length` is the documented keyword; `max_len` is only a legacy
# alias for it, so relying on it risks the limit being silently ignored.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, model_max_length=MAX_LENGTH)


def tokenize_function(examples):
    """Tokenize one batch of examples from its ``"text"`` column.

    Args:
        examples: Batch handed over by ``DatasetDict.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence padded and
        truncated to exactly ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        examples["text"],
        # Fixed-length padding is kept on purpose: all examples end up with the
        # same shape, so they can be batched without a padding collator.
        padding="max_length",
        truncation=True,
        # Explicit bound so padding/truncation do not depend on tokenizer config.
        max_length=MAX_LENGTH,
    )


tokenized_datasets = datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/6508 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6508
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [5]:
# Accuracy metric object from the `evaluate` library, used by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class predictions
        compared against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [6]:
# Seed the RNGs *before* building the model: the classification head is newly
# initialised rather than loaded from the checkpoint, so without this its
# starting weights (and therefore the results) change from run to run.
set_seed(SEED)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=2)

training_args = TrainingArguments(
    output_dir="models/sbert-v2",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for `load_best_model_at_end` to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    # NOTE(review): with no `metric_for_best_model`, the "best" checkpoint is
    # the one with the lowest evaluation loss, not the highest accuracy —
    # confirm this is the intended selection criterion.
    load_best_model_at_end=True,
    # Same seed as above, so data shuffling during training is reproducible too.
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    # The 'test' split doubles as the per-epoch evaluation set.
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/sbert_large_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5748,0.549309,0.7392
2,0.3073,0.810913,0.7436


TrainOutput(global_step=1628, training_loss=0.43345013883248595, metrics={'train_runtime': 6112.464, 'train_samples_per_second': 2.129, 'train_steps_per_second': 0.266, 'total_flos': 1.2130018625077248e+16, 'train_loss': 0.43345013883248595, 'epoch': 2.0})