In [None]:
import os
# Set in the very first cell so the flag is already in the process environment
# when later cells import torch.
# NOTE(review): going by its name, this opts in to PyTorch's experimental
# AOTriton kernels on ROCm builds — confirm against the PyTorch ROCm docs.
os.environ.update({"TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"})

In [None]:
import pandas as pd

# Read the ChnSentiCorp CSV and discard rows with missing values in a single
# chained expression, so re-running this cell always rebuilds `df` from disk.
df = pd.read_csv("./ChnSentiCorp_htl_all.csv").dropna()

In [None]:
# Show the label and the review text of the first row, one per line.
print(df.iloc[0]["label"], df.iloc[0]["review"], sep="\n")

In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Map-style dataset over the ChnSentiCorp CSV.

    Each item is a ``(review, label)`` pair taken from the ``review`` and
    ``label`` columns. Rows containing missing values are dropped on load.
    """

    def __init__(self, csv_path="./ChnSentiCorp_htl_all.csv"):
        """Load the CSV and drop incomplete rows.

        Args:
            csv_path: Path of the CSV file to read. Defaults to the file used
                throughout this notebook.
        """
        super().__init__()
        self.data = pd.read_csv(csv_path).dropna()

    def __getitem__(self, index):
        """Return the ``(review, label)`` pair at positional ``index``."""
        # Read from the frame owned by this instance, not the notebook-level
        # `df`: the dataset then works on a fresh kernel and can never get out
        # of sync with __len__.
        row = self.data.iloc[index]
        return row["review"], row["label"]

    def __len__(self):
        """Return the number of rows kept after dropping missing values."""
        return len(self.data)

In [None]:
dataset = MyDataset()

# Peek at the first five (review, label) samples.
for sample_idx in range(5):
    print(dataset[sample_idx])

In [None]:
from torch.utils.data import random_split

import torch

# 90/10 train/validation split. The generator is seeded so the same rows land
# in each subset on every run; with an unseeded split the validation set
# changes per kernel restart and accuracy numbers are not comparable.
trainset, validset = random_split(
    dataset,
    lengths=[0.9, 0.1],
    generator=torch.Generator().manual_seed(42),
)
len(trainset), len(validset)

In [None]:
from transformers import AutoTokenizer
# Tokenizer matching the "hfl/rbt3" checkpoint that the model cell loads.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="hfl/rbt3")

In [None]:
from torch.utils.data import DataLoader
import torch

def collate_fn(batch):
    """Turn a list of ``(text, label)`` samples into one model-ready batch.

    The texts are tokenized together, padded/truncated to 128 tokens and
    returned as PyTorch tensors; the labels are attached under ``"labels"``.
    """
    texts = [text for text, _ in batch]
    labels = [label for _, label in batch]
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = torch.tensor(labels)
    return encoded

batch_size = 32

# Training batches are reshuffled on every pass; validation order stays fixed.
trainloader = DataLoader(
    trainset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=True,
)
validloader = DataLoader(
    validset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

# Sanity check: print the first collated training batch and stop.
for first_batch in trainloader:
    print(first_batch)
    break

In [None]:
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification
# Load the sequence-classification model from the "hfl/rbt3" checkpoint,
# explicitly not using safetensors weights.
model = AutoModelForSequenceClassification.from_pretrained(
    "hfl/rbt3",
    use_safetensors=False,
)
# Run on the GPU whenever one is visible to torch; otherwise stay on the CPU.
model = model.to("cuda") if torch.cuda.is_available() else model

optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [None]:
# Number of validation batches, then the configured batch size, one per line.
print(len(validloader), validloader.batch_size, sep="\n")

In [None]:
def eval():
    """Compute the model's accuracy over ``validloader``.

    Puts ``model`` in evaluation mode and runs every validation batch with
    gradients disabled.

    Returns:
        float: Correct predictions divided by the number of samples actually
        evaluated, or ``0.0`` when the loader yields no samples.

    Note:
        The name shadows the built-in ``eval``; it is kept because ``train``
        calls it by this name.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in validloader:
            if torch.cuda.is_available():
                # Same per-tensor device move that train() uses.
                batch = {k: v.to("cuda") for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1)
            labels = batch["labels"]
            correct += (pred == labels).sum().item()
            # Count the labels really seen: the last batch is usually smaller
            # than batch_size, so len(validloader) * batch_size overcounts the
            # denominator and under-reports accuracy.
            total += labels.size(0)
    return correct / total if total else 0.0

def train(epoch=3, log_step=100):
    """Fine-tune ``model`` on ``trainloader`` and report accuracy per epoch.

    Args:
        epoch: Number of full passes over the training loader.
        log_step: Print the current loss every ``log_step`` optimizer steps.
    """
    steps_done = 0
    for epoch_idx in range(epoch):
        model.train()
        for raw_batch in trainloader:
            # Move every tensor of the batch to the GPU when one is available.
            if torch.cuda.is_available():
                batch = {name: tensor.to("cuda") for name, tensor in raw_batch.items()}
            else:
                batch = raw_batch

            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            steps_done += 1

            if steps_done % log_step != 0:
                continue
            print(f"cur_epoch： {epoch_idx}， global_step: {steps_done}， loss: {loss.item()}")

        # Validation accuracy once per epoch.
        epoch_acc = eval()
        print(f"cur_epoch： {epoch_idx}，acc: {epoch_acc}")

In [None]:
# Start fine-tuning; the arguments spell out train()'s defaults so the run
# configuration is visible in the cell that launches it.
train(epoch=3, log_step=100)

In [None]:
from transformers import pipeline

sen = "我觉得这家酒店不错，饭很好吃！"
# Maps the pipeline's label strings to the text printed for the user.
label2_label = {
    "LABEL_0": "差评！",
    "LABEL_1": "好评！",
}
with torch.inference_mode():
    pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
    model.eval()
    output = pipe(sen)
    print(output)
    # The pipeline returns a list of dicts; use the label of the first entry.
    predicted_label = output[0]["label"]
    print(label2_label[predicted_label])


In [None]:
sen = "Man！What can i say?"
# Maps the predicted class index to the text printed for the user.
id2_label = {
    0: "差评！",
    1: "好评！",
}

with torch.inference_mode():
    model.eval()
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the encoded tensors to the GPU when one is available.
    if torch.cuda.is_available():
        inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}
    outputs = model(**inputs)
    predicted_id = outputs.logits.argmax(dim=-1).item()
    print(id2_label[predicted_id])