In [None]:
import os
# Opt-in flag, set here so it is already in the environment when `torch` is
# imported in the next cell.
# NOTE(review): presumably enables PyTorch's experimental AOTriton attention
# kernels on ROCm (AMD) builds and is ignored elsewhere — confirm.
os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, ClassLabel
import torch
import evaluate

In [None]:
# Pull the tokenizer and a sequence-classification model from the same
# pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Attach readable class names to the config:
# id 0 -> "差评！" (negative review), id 1 -> "好评！" (positive review).
model.config.id2label = {0: "差评！", 1: "好评！"}
model.config.label2id = {"差评！": 0, "好评！": 1}

# Move to the GPU when one is visible; otherwise keep the model where it was loaded.
model = model.to("cuda") if torch.cuda.is_available() else model

In [None]:
# Read the review CSV as a single "train" split, then keep only the rows
# whose "review" field is present.
datasets = load_dataset(
    "csv",
    data_files="ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda row: row["review"] is not None)

In [None]:
# Re-type the "label" column as a two-class ClassLabel whose names (index 0,
# index 1) match the id2label mapping set on the model config.
# NOTE(review): presumably the stratified split on "label" requires a
# ClassLabel column — confirm.
datasets = datasets.cast_column("label", ClassLabel(names=["差评！", "好评！"]))

In [None]:
# Hold out 10% of the reviews for evaluation, stratified on "label" so both
# splits keep the same class ratio.
# The split is seeded: without a fixed seed a different set of rows lands in
# "test" on every Restart & Run All, so metrics from separate runs would be
# measured on different held-out data and could not be compared.
datasets = datasets.train_test_split(test_size=0.1, stratify_by_column="label", seed=42)

In [None]:
def process_func(example, tokenizer=tokenizer):
    """Tokenize review text and attach the matching labels.

    Args:
        example: Mapping with a "review" entry (text to tokenize) and a
            "label" entry. It is called through ``map(..., batched=True)``,
            so each entry is presumably a list of values — TODO confirm.
        tokenizer: Callable used to encode the text. The default is bound at
            definition time to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer's output with the labels stored under "labels".
        Truncation is enabled with ``max_length=512``; no padding argument
        is passed here.
    """
    encoded = tokenizer(example["review"], max_length=512, truncation=True)
    encoded["labels"] = example["label"]
    return encoded


# Tokenize both splits in batches and drop the original columns, leaving only
# what process_func returns.
datasets = datasets.map(
    process_func,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

In [None]:
# Batch collator that is handed to the Trainer as `data_collator`.
# NOTE(review): presumably pads every batch to its longest sequence using the
# tokenizer's padding settings (process_func itself does not pad) — confirm.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the four classification metrics read by eval_matric, in this order:
# accuracy, f1, recall, precision.
accuracy, f1, recall, precision = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "recall", "precision")
)

In [None]:
def eval_matric(eval_predict):
    """Compute accuracy, F1, recall and precision for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``. ``predictions`` must
            support ``.argmax(axis=-1)``; the result of that call is taken as
            the predicted class ids. NOTE(review): presumably an array of
            per-class scores with the class axis last — confirm.

    Returns:
        A single dict holding the merged results of the notebook-level
        ``accuracy``, ``f1``, ``recall`` and ``precision`` metrics, inserted
        in that order.
    """
    scores_per_class, labels = eval_predict
    predicted_ids = scores_per_class.argmax(axis=-1)

    merged = {}
    # Same metric order as the key order callers see in the returned dict.
    for metric in (accuracy, f1, recall, precision):
        merged.update(metric.compute(predictions=predicted_ids, references=labels))
    return merged

In [None]:
# Settings for the fine-tuning run, grouped by concern.
train_args = TrainingArguments(
    # Output location and logging.
    output_dir="./checkpoints",
    report_to="tensorboard",
    logging_steps=10,
    # Optimisation schedule and batch sizes.
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint on a step schedule, and reload the best
    # checkpoint once training finishes.
    # NOTE(review): eval_steps is not set — presumably it falls back to
    # logging_steps (10), of which save_steps (50) is a multiple — confirm.
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
)
train_args  # last expression of the cell: display the resolved arguments

In [None]:
# Wire the model, run settings, batching, data splits and metric callback
# into one Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=collator,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    compute_metrics=eval_matric,
)

In [None]:
# Run fine-tuning with the model, arguments and datasets configured on `trainer`.
trainer.train()

In [None]:
# Run the trainer's model over the held-out "test" split; as the cell's last
# expression, the returned value is displayed.
trainer.predict(datasets["test"])

In [None]:
# from transformers import pipeline
# pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

In [None]:
# pipe("这个东西虽然有点贵，但用起来还行。")

In [None]:
# pipe.model.train()
# pipe("这个东西虽然有点贵，但用起来还行。")