下载数据集（本示例读取其中的 waimai_10k.csv，需放在 data/ 目录下）：https://github.com/SophonPlus/ChineseNlpCorpus

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split
import pandas as pd

# Load the review CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('data/waimai_10k.csv')
print(f"Total number of samples: {len(data)}")

# Drop every row that contains a missing value.
# NOTE: the former mid-cell `data.head()` was removed: a bare expression that is
# not the last statement of a cell is evaluated and silently discarded, so it
# never displayed anything.
data = data.dropna()
print(f"Total number of samples after removing NaN: {len(data)}")

Total number of samples: 11987
Total number of samples after removing NaN: 11987


In [2]:
class MyDataset(Dataset):
    """Map-style dataset serving ``(review, label)`` pairs from a DataFrame."""

    def __init__(self, data):
        """Store ``data`` with all rows that contain missing values removed."""
        super().__init__()
        self.data = data.dropna()

    def __len__(self):
        """Return the number of rows kept after dropping missing values."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``review`` and ``label`` fields of the row at position ``index``."""
        # Positional lookup (iloc), so gaps left in the index by dropna() do not matter.
        row = self.data.iloc[index]
        return row['review'], row['label']

# Instantiate MyDataset, then print the first sample and the dataset length as a sanity check.
dataset = MyDataset(data)
print(dataset[0])
print(len(dataset))

('很快，好吃，味道足，量大', 1)
11987


In [3]:
# Split the dataset 80% / 20% into training and test subsets.
# A seeded generator makes the split reproducible across kernel restarts, so the
# accuracy reported after training is comparable between runs. Without it the
# test set changes on every execution.
trainset, testset = random_split(
    dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(42),
)

print(f"Total number of samples: {len(dataset)}")
print(f"Number of training samples: {len(trainset)}")
print(f"Number of test samples: {len(testset)}")
print(trainset[0])

Total number of samples: 11987
Number of training samples: 9590
Number of test samples: 2397
('粉太淡没味，没上次好吃了，也不辣。', 0)


In [4]:
# Load the tokenizer for the "hfl/rbt3" checkpoint; collate_func reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def collate_func(batch):
    """Collate ``(text, label)`` samples into one tokenized batch.

    Args:
        batch: sequence of ``(text, label)`` pairs, one per sample.

    Returns:
        The tokenizer output for all texts (PyTorch tensors) with an additional
        ``"labels"`` entry holding the labels as a tensor.
    """
    texts = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    # Pad only up to the longest sequence in this batch instead of always padding
    # to 512 tokens: padded positions are masked out by the attention mask either
    # way, so a fixed 512-wide batch just wastes compute and memory on short
    # reviews. Sequences longer than 512 tokens are still truncated.
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs["labels"] = torch.tensor(labels)
    return inputs

# Build the data loaders: training batches are reshuffled, test batches keep a fixed order.
train_dataloader = DataLoader(
    trainset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_func,
)
test_dataloader = DataLoader(
    testset,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_func,
)

# Inspect one collated training batch (shown as the cell output).
next(iter(train_dataloader))

{'input_ids': tensor([[ 101, 1962, 1391,  ...,    0,    0,    0],
        [ 101, 1343,  782,  ...,    0,    0,    0],
        [ 101, 3862, 5976,  ...,    0,    0,    0],
        ...,
        [ 101, 2523, 2571,  ...,    0,    0,    0],
        [ 101, 7824, 7676,  ...,    0,    0,    0],
        [ 101,  679, 3221,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0])}

In [5]:
# Sequence-classification model built from the "hfl/rbt3" checkpoint. The
# classifier weights are not part of that checkpoint and are newly initialised
# (see the warning printed by this cell), so the model must be fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# AdamW over all model parameters.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Move the model to the GPU when CUDA is available; otherwise it is left where it is.
if torch.cuda.is_available():
    model.cuda()
    
def evaluate():
    """Measure the accuracy of the global ``model`` on ``test_dataloader``.

    Returns:
        float: percentage (0-100) of test samples whose arg-max logit equals
        the sample's label.
    """
    model.eval()
    # The CUDA check does not change between batches, so do it once.
    use_cuda = torch.cuda.is_available()
    correct = 0

    with torch.inference_mode():
        for batch in test_dataloader:
            if use_cuda:
                batch = {k: v.cuda() for k, v in batch.items()}
            # Only the logits are needed here; the loss is not used for accuracy.
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == batch['labels']).sum().item()

    # Divide by the number of samples (not batches) and convert to a percentage.
    return correct / len(test_dataloader.dataset) * 100

In [7]:
def train(epoch=1, log_step=50):
    """Fine-tune the global ``model`` on ``train_dataloader``.

    Args:
        epoch: number of full passes over the training data.
        log_step: print the current loss every ``log_step`` optimisation steps.

    After each epoch the test accuracy from ``evaluate()`` is printed.
    """
    on_gpu = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        # evaluate() switches the model to eval mode, so re-enable training mode each epoch.
        model.train()
        for batch in train_dataloader:
            if on_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}

            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()

            # Steps are reported 1-based, hence the +1 on the 0-based counter.
            if (global_step+1) % log_step == 0:
                print(f'Epoch: {ep+1}, global_step: {global_step+1},loss: {loss.item()}')
            global_step += 1

        acc = evaluate()
        print(f'Epoch: {ep+1}, Accuracy: {acc:.2f}%')

In [8]:
# Run training with the defaults: one epoch, loss logged every 50 steps.
train()

Epoch: 1, global_step: 50,loss: 0.5368702411651611
Epoch: 1, global_step: 100,loss: 0.2037513554096222
Epoch: 1, global_step: 150,loss: 0.2375202625989914
Epoch: 1, global_step: 200,loss: 0.4101874530315399
Epoch: 1, global_step: 250,loss: 0.28052493929862976
Epoch: 1, global_step: 300,loss: 0.37337055802345276
Epoch: 1, global_step: 350,loss: 0.10706482827663422
Epoch: 1, global_step: 400,loss: 0.078570157289505
Epoch: 1, global_step: 450,loss: 0.6404622793197632
Epoch: 1, global_step: 500,loss: 0.23093605041503906
Epoch: 1, global_step: 550,loss: 0.30611029267311096
Epoch: 1, global_step: 600,loss: 0.4223825931549072
Epoch: 1, Accuracy: 90.57%


In [9]:
# Two example reviews; only sentence1 is classified in this cell.
sentence1 = "这家饭馆真贴心啊，知道我吃不饱还特地在里面防蟑螂！"
sentence2 = "难吃的和shit一样，恶心！"

model.eval()

# Maps the predicted class index to a readable name.
id2_label = {0: "negative", 1: "positive"}

with torch.inference_mode():
    inputs = tokenizer(sentence1, return_tensors="pt")
    # Send the inputs to whatever device the model lives on. The previous
    # unconditional .cuda() crashed on CPU-only machines even though the model
    # itself is only moved to the GPU when CUDA is available.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    # Class probabilities, printed for inspection; the prediction uses the raw logits.
    softmax_ = torch.softmax(logits, dim=1)
    print(softmax_)
    predicted_class = torch.argmax(logits, dim=-1)
    print(f"Sentence: {sentence1}")
    print(f"Predicted class: {id2_label.get(predicted_class.item())}")

tensor([[0.7320, 0.2680]], device='cuda:0')
Sentence: 这家饭馆真贴心啊，知道我吃不饱还特地在里面防蟑螂！
Predicted class: negative


In [12]:
import os

# Save only the model weights (state_dict) under ./saved_models.
output_dir = "./saved_models"
# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
model_save_path = os.path.join(output_dir, "rbt3_save.pt")
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to ./saved_models\rbt3_save.pt


In [13]:
# Rebuild the model from the "hfl/rbt3" checkpoint. from_pretrained does load the
# checkpoint's weights and newly initialises the classifier head (see the warning
# printed by this cell); load_state_dict below then replaces them with the
# fine-tuned values saved earlier.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Load the saved state dict.
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only machine; weights_only=True restricts unpickling to tensors and plain
# containers, which is all a state dict contains.
model_save_path = "./saved_models/rbt3_save.pt"
state_dict = torch.load(model_save_path, map_location="cpu", weights_only=True)

# Copy the saved weights into the model instance.
model.load_state_dict(state_dict)

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model.cuda()

# Switch the model to evaluation mode.
model.eval()

# Inference; only sentence1 is classified in this cell.
sentence1 = "这家饭馆真贴心啊，知道我吃不饱还特地在里面防蟑螂！"
sentence2 = "难吃的和shit一样，恶心！"

# Maps the predicted class index to a readable name.
id2_label = {0: "negative", 1: "positive"}

with torch.inference_mode():
    inputs = tokenizer(sentence1, return_tensors="pt")
    # Send the inputs to the model's device; the previous unconditional .cuda()
    # crashed on CPU-only machines although the model move above is guarded.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    # Class probabilities, printed for inspection; the prediction uses the raw logits.
    softmax_ = torch.softmax(logits, dim=1)
    print(softmax_)
    predicted_class = torch.argmax(logits, dim=-1)
    print(f"Sentence: {sentence1}")
    print(f"Predicted class: {id2_label.get(predicted_class.item())}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[0.7320, 0.2680]], device='cuda:0')
Sentence: 这家饭馆真贴心啊，知道我吃不饱还特地在里面防蟑螂！
Predicted class: negative
