### Импорты

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install -U accelerate

In [2]:
import torch
import evaluate
import numpy as np
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

In [None]:
# Pretrained 3-class sentiment checkpoint shared by the tokenizer and the model.
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Two toy sentences used for a single smoke-test optimisation step.
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Dummy targets: class index 1 for both sentences.
batch["labels"] = torch.tensor([1, 1])

# `transformers.AdamW` is deprecated in favour of the PyTorch implementation,
# so the optimizer is taken from `torch.optim` (torch is already imported).
optimizer = torch.optim.AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
# NOTE(review): this step updates the weights of `model` in place, and the same
# object is later passed to the first Trainer; reload the checkpoint first if
# fine-tuning should start from the untouched pretrained weights.
optimizer.step()

### Предобработка

In [None]:
# Load the 'english' configuration of the 'tyqiangz/multilingual-sentiments' dataset.
raw_datasets = load_dataset('tyqiangz/multilingual-sentiments', 'english')
def change_labels(example):
    """Swap the label values 0 and 2 on ``example``; leave any other label as is.

    Args:
        example: Mapping with a ``'label'`` key. It is modified in place.

    Returns:
        The same ``example`` object that was passed in.
    """
    # Only these two label values are exchanged; everything else is untouched.
    swapped = {0: 2, 2: 0}
    current = example['label']
    if current in swapped:
        example['label'] = swapped[current]
    return example

# Apply the 0 <-> 2 label swap example by example (map is not batched here).
raw_datasets = raw_datasets.map(change_labels)
# Notebook display: first three training rows, to eyeball the remapped labels.
raw_datasets['train'][0:3]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Tokenize the training texts a single time (no truncation or padding here).
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["text"])
# Both names are kept in case other cells refer to either one; they share one
# result instead of running the identical tokenization over the split twice.
tokenized_sentences_2 = tokenized_sentences_1

In [6]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with truncation enabled.

    Uses the notebook-level ``tokenizer``.

    Args:
        example: Mapping with a ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    text = example["text"]
    encoded = tokenizer(text, truncation=True)
    return encoded

In [None]:
# Tokenize the dataset; batched=True hands tokenize_function batches of examples.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Notebook display of the resulting dataset object.
tokenized_datasets

In [8]:
# Collator that pads the examples of each batch, built from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# Training configuration with library defaults; "test-trainer" is the first
# positional argument (the output directory in the Trainer API).
training_args = TrainingArguments("test-trainer")

### Обучение

In [10]:
# First training run: train/validation splits, padding collator, no metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Run inference on the validation split, then print the shapes of the raw
# model outputs and of the reference labels.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [13]:
# Index of the largest value along the last axis, i.e. the predicted class per
# row (assumes the outputs are per-class scores — TODO confirm).
preds = np.argmax(predictions.predictions, axis=-1)

### Черновик

In [14]:
def compute_metrics(eval_preds):
    """Compute accuracy from the evaluation outputs handed over by the Trainer.

    Args:
        eval_preds: Pair ``(logits, labels)``. The logits are reduced to class
            indices with an argmax over the last axis.

    Returns:
        The result of the metric's ``compute`` call on the predicted class
        indices and the reference labels.
    """
    # `evaluate.load` expects a metric identifier. The previous argument was
    # the dataset id ('tyqiangz/multilingual-sentiments'), which is not a
    # metric, so "accuracy" is loaded instead.
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [16]:
# Second run: same data and collator, but evaluation happens every epoch and
# compute_metrics is used to score it.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
)
# Reload the checkpoint so this run does not reuse the previously trained model object.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the second fine-tuning pass (the Trainer that evaluates every epoch).
trainer.train()