# 文本分类实例

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# 通过新方法加载数据和数据清洗
import datasets
dataset = datasets.load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
print(len(dataset))
dataset = dataset.filter(lambda x: x["review"] is not None)
print(len(dataset))
# 划分数据集
split_dataset = dataset.train_test_split(test_size=0.1)
split_dataset

Generating train split: 0 examples [00:00, ? examples/s]

7766


Filter:   0%|          | 0/7766 [00:00<?, ? examples/s]

7765


DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

In [3]:
# 进行分词
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(example):
    tokenized_example = tokenizer(example["review"], max_length=128, truncation=True)
    tokenized_example["labels"] = example["label"]
    return tokenized_example

tokenized_datasets = split_dataset.map(function=process_function, batched=True, remove_columns=split_dataset["train"].column_names)

tokenized_datasets

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [4]:
# 创建dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

train_loader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))

valid_loader = DataLoader(tokenized_datasets["test"], batch_size=64, shuffle=False, collate_fn=DataCollatorWithPadding(tokenizer))

In [7]:
next(enumerate(train_loader))[1]

{'input_ids': tensor([[ 101, 6821,  702,  ...,    0,    0,    0],
        [ 101,  868,  711,  ...,    0,    0,    0],
        [ 101, 3302, 1218,  ...,    0,    0,    0],
        ...,
        [ 101, 4692, 6814,  ...,  817, 2347,  102],
        [ 101, 1343, 5722,  ..., 2791, 7313,  102],
        [ 101, 2137, 4638,  ..., 6574, 7030,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        0, 0, 1, 1, 1, 1, 1, 1])}

In [5]:
from torch.optim import Adam
import torch

# 模型及优化器定义
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
optimizer = Adam(model.parameters(), lr=0.00002)

if torch.cuda.is_available():
    model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import evaluate

clf_metrics = evaluate.combine(["accuracy", "f1"])

In [8]:
# 模型训练
def train(epochs, log_step=30):
    for epoch in range(epochs):
        step=0
        model.train()
        print(f"<<<<<<<<Training on epoch{epoch} >>>>>>>>")
        for batch in train_loader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()

            if step % log_step == 0:
                print(f"global_step: {step}, loss: {output.loss.item()}")
            step += 1
        print(evaluate())


# 模型验证
def evaluate():
    model.eval()
    # 该方法比no_grad()更加的优化了测试的内存管理方面
    with torch.inference_mode():
        for batch in valid_loader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            clf_metrics.add_batch(predictions=pred, references=batch["labels"])
    return clf_metrics.compute()

In [9]:
train(epochs=5)

<<<<<<<<Training on epoch0 >>>>>>>>


  attn_output = torch.nn.functional.scaled_dot_product_attention(


global_step: 0, loss: 0.6473658084869385
global_step: 30, loss: 0.6436285972595215
global_step: 60, loss: 0.4400003254413605
global_step: 90, loss: 0.2691814601421356
global_step: 120, loss: 0.27176809310913086
global_step: 150, loss: 0.19636385142803192
global_step: 180, loss: 0.1395656019449234
global_step: 210, loss: 0.473237544298172
{'accuracy': 0.8957528957528957, 'f1': 0.9236569274269557}
<<<<<<<<Training on epoch1 >>>>>>>>
global_step: 0, loss: 0.5004894733428955
global_step: 30, loss: 0.2739998698234558
global_step: 60, loss: 0.3139651119709015
global_step: 90, loss: 0.45809096097946167
global_step: 120, loss: 0.17288801074028015
global_step: 150, loss: 0.25362280011177063
global_step: 180, loss: 0.35154789686203003
global_step: 210, loss: 0.3060349225997925
{'accuracy': 0.8970398970398971, 'f1': 0.9229287090558767}
<<<<<<<<Training on epoch2 >>>>>>>>
global_step: 0, loss: 0.11527769267559052
global_step: 30, loss: 0.2694074511528015
global_step: 60, loss: 0.1163197010755539
g

KeyboardInterrupt: 

In [18]:
# 预测
test = "这家酒店不错，饭很好吃"
with torch.inference_mode():
    test_tensor = tokenizer(test, return_tensors="pt")
    test_tensor = {k: v.cuda() for k, v, in test_tensor.items()}
    test_result = model(**test_tensor).logits
    pred = torch.argmax(test_result, dim=-1)
    print(pred.item())

1


In [19]:
# 创建pipeline
from transformers import pipeline

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cuda:0")

Device set to use cuda:0


In [20]:
pipe("这家酒店不错，饭很好吃")

[{'label': 'LABEL_1', 'score': 0.9989665746688843}]