In [None]:
import os
# Set this before torch is imported in the next cell.
# NOTE(review): presumably opts in to PyTorch's experimental AOTriton attention
# kernels on AMD ROCm GPUs — confirm against the PyTorch ROCm notes.
os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset, ClassLabel
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

In [None]:
# Load the pretrained checkpoint together with its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Attach human-readable class names (and the reverse mapping) to the config.
model.config.id2label = {0: '差评！', 1: '好评！'}
model.config.label2id = {'差评！': 0, '好评！': 1}

# Move the weights to the GPU when one is present; otherwise stay on the CPU.
if torch.cuda.is_available():
    model = model.to("cuda")

# The optimizer is built last so it captures the parameters on their final device.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Read the CSV as a single "train" split and drop rows whose review is missing.
datasets = load_dataset(
    "csv",
    data_files="ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda example: example["review"] is not None)
datasets

In [None]:
# Display the column schema of the loaded dataset.
datasets.features

In [None]:
# Re-type "label" as a ClassLabel with named classes; the stratified split
# further down is keyed on this column.
datasets = datasets.cast_column(
    "label",
    ClassLabel(names=["差评！", "好评！"]),
)
datasets

In [None]:
# Hold out 10% of the rows for validation, keeping the label ratio the same in
# both splits. A fixed seed makes the split reproducible so accuracy numbers are
# comparable across kernel restarts.
datasets = datasets.train_test_split(
    test_size=0.1,
    stratify_by_column="label",
    seed=42,
)
datasets

In [None]:
def process_func(example, tokenizer=tokenizer):
    """Tokenize the ``review`` field and attach the label under ``labels``.

    Args:
        example: Mapping with ``"review"`` and ``"label"`` entries. The
            ``map`` call in this notebook passes ``batched=True``, so each
            entry holds a batch of values.
        tokenizer: Callable tokenizer; defaults to the notebook-level one.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry copied from
        ``example["label"]``. Sequences are truncated to at most 512 tokens;
        no padding is applied here.
    """
    encoded = tokenizer(example["review"], truncation=True, max_length=512)
    encoded["labels"] = example["label"]
    return encoded
# Tokenize every split in batches and drop the original columns so that only
# the tokenizer outputs and "labels" remain.
datasets = datasets.map(
    process_func,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
datasets

In [None]:
# The collator pads each batch when it is assembled.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training batches are shuffled.
trainloader = DataLoader(
    datasets["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)

# Validation keeps a fixed order and uses a larger batch size.
validloader = DataLoader(
    datasets["test"],
    batch_size=64,
    shuffle=False,
    collate_fn=collator,
)

In [None]:
# Inspect one collated training batch; enumerate makes this an (index, batch) pair.
next(enumerate(trainloader))

In [None]:
def eval(eval_model=None, dataloader=None):
    """Compute classification accuracy over a validation dataloader.

    Note: this function shadows the builtin ``eval``; the name is kept so that
    existing calls in the notebook continue to work.

    Args:
        eval_model: Model to evaluate. Defaults to the notebook-level ``model``.
        dataloader: Iterable of batches, each containing a ``"labels"`` entry
            and accepted by the model as keyword arguments. Defaults to the
            notebook-level ``validloader``.

    Returns:
        The fraction of correctly classified samples as a float, or ``0.0``
        when the dataloader yields no samples.
    """
    net = model if eval_model is None else eval_model
    loader = validloader if dataloader is None else dataloader
    use_cuda = torch.cuda.is_available()

    correct = 0
    seen = 0
    net.eval()
    with torch.inference_mode():
        for batch in loader:
            if use_cuda:
                batch = batch.to("cuda")
            outputs = net(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]
            correct += (predictions == labels).sum().item()
            # Count the labels actually seen: len(loader) * batch_size would
            # over-count whenever the final batch is only partially filled.
            seen += labels.numel()
    return correct / seen if seen else 0.0

def train(epoch=3, log_step=100):
    """Fine-tune the notebook-level ``model`` on ``trainloader``.

    Runs ``epoch`` passes over the training loader, taking one optimizer step
    per batch. The loss is printed every ``log_step`` optimizer steps, and the
    accuracy returned by ``eval()`` is printed after each epoch.

    Args:
        epoch: Number of passes over ``trainloader``.
        log_step: Print the current loss every this many optimizer steps.
    """
    on_gpu = torch.cuda.is_available()
    steps_done = 0
    for epoch_idx in range(epoch):
        for batch in trainloader:
            if on_gpu:
                batch = batch.to("cuda")
            # eval() leaves the model in eval mode, so switch back to train mode.
            model.train()
            optimizer.zero_grad()
            result = model(**batch)
            loss = result.loss
            loss.backward()
            optimizer.step()
            steps_done += 1
            if steps_done % log_step == 0:
                print(f"epoch {epoch_idx}, step {steps_done}, loss: {loss.item()}")
        accuracy = eval()
        print(f"epoch {epoch_idx}, step {steps_done}, accuracy: {accuracy}")

In [None]:
# Report the true number of samples behind each loader. len(loader) * batch_size
# over-counts whenever the final batch is only partially filled.
print(len(trainloader.dataset))
print(len(validloader.dataset))

In [None]:
# Baseline accuracy, measured before the train() call in the next cell.
eval()

In [None]:
# Run fine-tuning with the default arguments (epoch=3, log_step=100).
train()

In [None]:
# Accuracy after the train() call in the previous cell.
eval()

In [None]:
from transformers import pipeline

# Wrap the fine-tuned model and tokenizer in a text-classification pipeline;
# device 0 selects the first GPU, -1 selects the CPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Run the text-classification pipeline on one sample sentence.
pipe("这个东西虽然有点贵，但用起来还行。")

In [None]:
# Switch the pipeline's underlying model to training mode, then classify the
# same sentence again.
# NOTE(review): presumably this demonstrates that train-mode layers such as
# dropout can perturb predictions — confirm intent. Nothing in this cell
# switches the model back to eval mode afterwards.
pipe.model.train()
pipe("这个东西虽然有点贵，但用起来还行。")