In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np

In [None]:
import pandas as pd

# Load the raw table from data.csv (path is relative to the kernel's working directory).
# NOTE(review): later cells read an "analysis" text column from this data; the Trainer
# presumably also needs a label column -- confirm the CSV schema.
df = pd.read_csv('data.csv')

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

In [None]:
# Wrap the DataFrame in a `datasets.Dataset` so it can be mapped and split below.
dataset = Dataset.from_pandas(df)

In [None]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "cointegrated/rubert-tiny2"

# Tokenizer matching the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Two-label sequence-classification model built from the same checkpoint.
# ignore_mismatched_sizes=True lets loading proceed when a checkpoint weight's
# shape differs from the shape this configuration expects.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

In [None]:
def tokenize_function(examples):
    """Tokenize the "analysis" field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that provides the texts under the "analysis" key
            (a batch of rows when used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts, called with
        truncation enabled and ``padding="max_length"``.
    """
    texts = examples["analysis"]
    # No explicit max_length is given, so padding/truncation fall back to the
    # tokenizer's own configured maximum length.
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [None]:
# Apply tokenize_function to the whole dataset; batched=True passes batches of rows
# to the function instead of one row at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Hold out 20% of the tokenized examples for evaluation. The fixed seed makes the
# shuffle-and-split reproducible, so metrics are comparable across kernel restarts.
# NOTE: this rebinds `tokenized_datasets` from the mapped dataset to the split result
# (indexed by "train"/"test" below); re-run the preceding `dataset.map(...)` cell
# before re-running this one.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the split result to check the resulting splits and their columns.
tokenized_datasets

In [None]:
# Training configuration.
# Evaluation and checkpointing both run once per epoch; the two strategies are kept
# identical because load_best_model_at_end=True needs a checkpoint for every
# evaluation it may pick as "best".
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir="./checkpoint",        # plain literal: the f-prefix had no placeholders
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=25,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,               # cap on how many checkpoints are kept on disk
    load_best_model_at_end=True,
    # Checkpoints are ranked by the "f1" value reported by compute_metrics.
    metric_for_best_model="f1",
)

In [None]:
from evaluate import load as load_metric
import numpy as np

# Load the "f1" metric from the `evaluate` library; compute_metrics below uses it.
metric = load_metric("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A two-item ``(logits, labels)`` pair. The predicted class
            for each example is the argmax of the logits over the last axis.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # Pick the highest-scoring class index for every example.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    scores = metric.compute(references=gold_labels, predictions=predicted_classes)
    return scores

In [None]:
# Assemble the Trainer: trains on the "train" split, evaluates on the "test" split,
# and reports metrics through compute_metrics.
# No tokenizer or data collator is passed here, so batching relies on the Trainer's
# defaults (inputs were already padded to a fixed length in tokenize_function).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration in training_args (long-running cell).
trainer.train()

In [None]:
# Evaluate on the eval dataset given to the Trainer and print the returned metrics.
results = trainer.evaluate()
print(results)

In [None]:
# Persist the model and the tokenizer side by side in ./result so the directory
# can be loaded on its own for inference.
model.save_pretrained("./result")
tokenizer.save_pretrained("./result")

In [None]:
from transformers import pipeline

# Smoke test: reload the saved model and tokenizer from ./result as a
# text-classification pipeline and classify one sample sentence.
classifier = pipeline("text-classification", model="./result", tokenizer="./result")

text = "Какой-то текст."
result = classifier(text)

# Show the pipeline's output for the sample text.
print(result)