In [None]:
from pathlib import Path
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

In [None]:
# Root directory for the dataset CSVs and the model outputs; '.' resolves against
# the kernel's current working directory.
# NOTE(review): Drive is mounted at /content/drive, but this points at the working
# directory rather than the mount — confirm the data actually lives here.
workdir = Path('.')

In [None]:
!pip install datasets transformers[torch]

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
# Suffix selecting which family of split files to load ('<split><suffix>.csv').
suffix = '_math'
# Directory that holds the train/valid/test CSV files.
splits_dir = workdir / 'dataset' / 'train_val_test_splits'
# Load the three CSV splits into a single dataset dictionary keyed by split name.
dataset = load_dataset(
    'csv',
    data_files={
        split: str(splits_dir / f'{split}{suffix}.csv')
        for split in ('train', 'valid', 'test')
    },
)

In [None]:
# Hugging Face model id; used to load both the tokenizer and the classification
# model, and to derive the name of the saved model directory.
checkpoint = 'ai-forever/ruBert-base'

In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EvalPrediction, set_seed

# Fix the random seed (via transformers.set_seed) so runs are repeatable.
set_seed(42)

# Tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    """Tokenize sentence/term pairs.

    The "sentence" field is encoded as the first segment and the "terms" field
    as the second; the encoded pair is truncated to at most 512 tokens.

    Args:
        example: Mapping with "sentence" and "terms" entries (a batch of rows
            when used with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's encoding for the pair(s).
    """
    first_segment = example["sentence"]
    second_segment = example["terms"]
    return tokenizer(
        first_segment,
        second_segment,
        truncation=True,
        max_length=512,
    )


# Tokenize every split, processing rows in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Drop the raw text/metadata columns and rename the target column to "labels".
tokenized_dataset = tokenized_dataset.remove_columns(["sentence", "terms", "source"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
# Return torch tensors when rows are accessed.
tokenized_dataset.set_format("torch")
# shuffle() returns a new dataset instead of shuffling in place, so the result
# has to be assigned for the shuffle to take effect.
tokenized_dataset = tokenized_dataset.shuffle(seed=42)
# Collator that pads the examples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pretrained checkpoint with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2, ignore_mismatched_sizes=True
)
# Place the model on the GPU.
model = model.to('cuda')

# NOTE(review): `steps` is not referenced anywhere in the visible cells — confirm
# whether it is still needed before removing it.
steps = 100

training_args = TrainingArguments(
    # output_dir is a string option; convert the Path explicitly, as is done for
    # the dataset file paths.
    str(workdir / 'classifier_model'),
    # Evaluate, checkpoint and log once per epoch. Evaluation and saving share the
    # same schedule, which load_best_model_at_end relies on.
    # NOTE(review): recent transformers releases rename `evaluation_strategy` to
    # `eval_strategy`, and the install cell does not pin a version — confirm
    # against the installed release.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    seed=42,
    # 'eval_f1' is the 'f1' entry returned by the compute_metrics function, with
    # the 'eval_' prefix the Trainer adds to evaluation metrics.
    metric_for_best_model='eval_f1',
    load_best_model_at_end=True,
    per_device_train_batch_size=32,
    learning_rate=5e-6,
    overwrite_output_dir=True,
    num_train_epochs=5,
)


from sklearn.metrics import precision_recall_fscore_support

def metrics(preds):
    """Compute binary precision, recall and F1 for one evaluation pass.

    Args:
        preds: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores); the argmax over the last axis
            of ``predictions`` is taken as the predicted class.

    Returns:
        Dict with the keys 'precision', 'recall' and 'f1'.
    """
    gold_labels = preds.label_ids
    predicted_labels = preds.predictions.argmax(axis=-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        gold_labels, predicted_labels, average='binary'
    )
    return {'precision': precision, 'recall': recall, 'f1': f1}

# Fine-tune on the train split and evaluate on the validation split.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # `metrics` is the metric function defined in this notebook; it supplies the
    # 'f1' value that metric_for_best_model='eval_f1' selects on. (The previously
    # referenced `metrics_with_test` is not defined anywhere in the notebook and
    # raised NameError on a fresh kernel.)
    compute_metrics=metrics,
)

# Labels of the test split.
# NOTE(review): not used by any visible cell — presumably a leftover from a
# removed test-set metric; confirm before deleting.
test_labels = tokenized_dataset['test']['labels']
trainer.train()

In [None]:
# Output directory name: last path component of the checkpoint id, followed by
# the suffix without its first character, followed by '_term_cls'.
# NOTE(review): there is no separator between the model name and the suffix
# (e.g. 'ruBert-basemath_term_cls') — confirm this naming is intended.
model_name = checkpoint.split('/')[-1]
save_dir = workdir / f'{model_name}{suffix[1:]}_term_cls'
trainer.save_model(save_dir)

In [None]:
def predict(sent, term):
    """Classify whether `term` is a term in the context of sentence `sent`.

    Args:
        sent: Sentence text, passed to the tokenizer as the first segment.
        term: Candidate term, passed to the tokenizer as the second segment.

    Returns:
        'Термин' if the highest-scoring class is 1, otherwise 'Не термин'.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    model = trainer.model
    model.eval()  # make sure dropout is disabled for inference
    tokenized = tokenizer(sent, term, truncation=True, max_length=512, return_tensors='pt')
    # Move the inputs to the model's device instead of moving the model to the
    # CPU, so calling predict() does not relocate the trainer's model as a side
    # effect.
    tokenized = tokenized.to(model.device)
    with torch.no_grad():  # gradients are not needed for inference
        logits = model(**tokenized).logits
    # .item() turns the one-element tensor into a plain int usable as a list index.
    return ['Не термин', 'Термин'][logits.argmax(dim=-1).item()]


# Smoke-test the classifier on one sentence/term pair; the call is the last
# expression so the cell displays the predicted label.
example_sentence = 'функциональный ряд — ряд, каждым членом которого, в отличие от числового ряда, является не число, а функция'
example_term = 'функциональный ряд'
predict(example_sentence, example_term)