In [1]:
import pandas as pd
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("DmitrySharonov/ru_sentiment_neg_pos_neutral")

README.md:   0%|          | 0.00/433 [00:00<?, ?B/s]

dataset.csv:   0%|          | 0.00/37.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/257485 [00:00<?, ? examples/s]

In [6]:
import os
from datasets import DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

os.environ["WANDB_DISABLED"] = "true"

label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

def convert_labels(example):
    if isinstance(example['label'], str):
        example['label'] = label2id[example['label']]
    return example

ds = ds.map(convert_labels)
ds = ds['train'].train_test_split(test_size=0.2, seed=42)

tokenizer = BertTokenizer.from_pretrained("DeepPavlov/distilrubert-tiny-cased-conversational")

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

ds = ds.map(tokenize_function, batched=True)

model = BertForSequenceClassification.from_pretrained(
    "DeepPavlov/distilrubert-tiny-cased-conversational",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=50,
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    save_total_limit=1,
    fp16=True

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Map:   0%|          | 0/105465 [00:00<?, ? examples/s]

Map:   0%|          | 0/26367 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/538 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Map:   0%|          | 0/84372 [00:00<?, ? examples/s]

Map:   0%|          | 0/21093 [00:00<?, ? examples/s]

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/428M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/428M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/distilrubert-tiny-cased-conversational and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 

Step,Training Loss
50,0.6182
100,0.2247
150,0.2052
200,0.1725
250,0.1664
300,0.1688
350,0.1607
400,0.1621
450,0.1496
500,0.1575




TrainOutput(global_step=2638, training_loss=0.13634896314532763, metrics={'train_runtime': 2426.3969, 'train_samples_per_second': 69.545, 'train_steps_per_second': 1.087, 'total_flos': 1.1099702640273408e+16, 'train_loss': 0.13634896314532763, 'epoch': 2.0})

In [7]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def predict_sentiment(text: str):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax(dim=-1).item()
    return id2label[predicted_class_id]

text = input('Введите пример поста: ')
print(f'Пост: {text}\nТональность: {predict_sentiment(text)}')

Введите пример поста:  Самое тупое из всего, что я видел


Пост: Самое тупое из всего, что я видел
Тональность: negative
