## 分类任务微调全流程

首先配置镜像

In [1]:
import os

# Route Hugging Face Hub traffic through the hf-mirror endpoint.
os.environ.update(HF_ENDPOINT='https://hf-mirror.com')
# Read the value back to confirm the setting took effect.
hf_endpoint = os.environ.get('HF_ENDPOINT')
print('HF_ENDPOINT:', hf_endpoint)

HF_ENDPOINT: https://hf-mirror.com


构建hugging face数据集格式

In [2]:
from datasets import load_dataset
# Map each split name to its tab-separated source file.
data_files = {
    "train": "./b站弹幕情感分析/data/train.tsv",
    "test": "./b站弹幕情感分析/data/dev.tsv",
}
# The "csv" builder is given a tab delimiter so it can read the .tsv files.
ChnSetiCorp_dataset = load_dataset(
    "csv",
    data_files=data_files,
    delimiter="\t",
)
ChnSetiCorp_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text_a'],
        num_rows: 9146
    })
    test: Dataset({
        features: ['label', 'text_a'],
        num_rows: 1200
    })
})

加载模型评估metrics

In [3]:
import evaluate

# Bundle several metrics so a single compute() call reports all of them.
metric_ids = ["accuracy", "f1", "precision", "recall", "matthews_correlation"]
metric = evaluate.combine(metric_ids)

模型标签对应

In [4]:
# Class index -> human-readable label shown in predictions.
id2label = {
    0: '是恶评别看',
    1: '会说话多说点',
}
# Inverse lookup, derived from the forward map so the two cannot drift apart.
label2id = {label: idx for idx, label in id2label.items()}

加载模型以及tokenizer

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Alternative checkpoint kept for reference: "yiyanghkust/finbert-tone-chinese"
model_name = "google-bert/bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels is the number of output classes; it matches the two entries of id2label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    ignore_mismatched_sizes=True,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Display the module tree of the loaded classifier.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

对整个数据集进行 tokenize（分词编码）

In [7]:
def preprocess_function(example):
    """Tokenize the ``text_a`` field of ``example``.

    The text is passed to the notebook-level ``tokenizer`` with truncation
    enabled and ``max_length=128``; the tokenizer's return value is handed
    back unchanged.
    """
    texts = example["text_a"]
    encoding = tokenizer(texts, truncation=True, max_length=128)
    return encoding

In [8]:
# Apply preprocess_function to the dataset in batched mode.
encoded_dataset = ChnSetiCorp_dataset.map(preprocess_function, batched=True)

In [9]:
# Display the mapped dataset to inspect its resulting features.
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text_a', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9146
    })
    test: Dataset({
        features: ['label', 'text_a', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1200
    })
})

参数初始化

In [10]:
from transformers import TrainingArguments, Trainer
# Run identifiers reused by later cells.
task = "sentiment"
batch_size = 32
metric_name = "matthews_correlation"

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-{task}",
    # Evaluate and checkpoint on the same per-epoch schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Best-checkpoint selection is keyed on `metric_name`.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Disable external experiment-tracking integrations.
    report_to="none",
    # Hub upload is left off on purpose (push_to_hub is not set).
)

计算metrics的函数

In [11]:
import numpy as np
def compute_metrics(eval_pred):
    """Turn raw model outputs into the combined evaluation metrics.

    ``eval_pred`` is unpacked into two items: the raw predictions and the
    reference labels. For every task except ``"stsb"`` the predicted class
    is the argmax along axis 1; for ``"stsb"`` the first column is used
    as-is. The result of ``metric.compute`` is returned unchanged.
    """
    raw_outputs, references = eval_pred
    is_regression = task == "stsb"
    if is_regression:
        predictions = raw_outputs[:, 0]
    else:
        predictions = np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=predictions, references=references)

训练

In [12]:
# Wire the model, training arguments, tokenized splits and metric function together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    # `tokenizer=` is deprecated on Trainer and emits a FutureWarning;
    # `processing_class=` is the supported replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [13]:
# Run fine-tuning with the settings held in `args`.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Matthews Correlation
1,No log,0.240145,0.919167,0.914688,0.955882,0.876897,0.840972
2,0.222900,0.199509,0.9375,0.936387,0.94198,0.93086,0.875026
3,0.222900,0.262704,0.935,0.932174,0.962298,0.903879,0.871464
4,0.086600,0.26033,0.94,0.939698,0.933444,0.946037,0.880081
5,0.086600,0.28441,0.945,0.943878,0.951973,0.935919,0.890087


TrainOutput(global_step=1430, training_loss=0.1201756110558143, metrics={'train_runtime': 239.0697, 'train_samples_per_second': 191.283, 'train_steps_per_second': 5.982, 'total_flos': 3008017140403200.0, 'train_loss': 0.1201756110558143, 'epoch': 5.0})

模型评估（由于设置了 load_best_model_at_end=True，这里评估的是验证集上 matthews_correlation 最高的那个检查点）

In [14]:
# Evaluate on the eval_dataset that was passed to the Trainer.
# NOTE(review): with load_best_model_at_end=True this presumably scores the
# best checkpoint rather than the last one — confirm.
trainer.evaluate()

{'eval_loss': 0.2844099998474121,
 'eval_accuracy': 0.945,
 'eval_f1': 0.9438775510204082,
 'eval_precision': 0.9519725557461407,
 'eval_recall': 0.9359190556492412,
 'eval_matthews_correlation': 0.8900873404740229,
 'eval_runtime': 2.0335,
 'eval_samples_per_second': 590.121,
 'eval_steps_per_second': 18.687,
 'epoch': 5.0}

使用训练好的模型做推理

In [15]:
from transformers import pipeline

# Load the checkpoint the Trainer recorded as best instead of a hard-coded
# absolute path and step number, so the cell works on any machine and for
# any training length. Requires trainer.train() to have run in this kernel.
best_checkpoint = trainer.state.best_model_checkpoint
# Reuse the device the Trainer ran on rather than assuming GPU 0 exists.
classifier = pipeline(
    "sentiment-analysis",
    model=best_checkpoint,
    device=trainer.args.device,
)
classifier

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7f960bd7da60>

In [20]:
# Classify one sample sentence and display the pipeline's result.
text = "李洋是个人？"
out = classifier(text)
out

[{'label': '是恶评别看', 'score': 0.8289309740066528}]

In [17]:
# Pull the 'score' field out of the first result in `out`.
out[0]['score']

0.8447766900062561