## 分类任务微调全流程

首先配置镜像

In [1]:
import os
# Point Hugging Face Hub traffic at the hf-mirror endpoint. This cell runs
# before any Hugging Face library is imported in this notebook.
os.environ.update(HF_ENDPOINT='https://hf-mirror.com')
# Read the value back from the environment to confirm the assignment took effect.
hf_endpoint = os.environ.get('HF_ENDPOINT')
print('HF_ENDPOINT:', hf_endpoint)

HF_ENDPOINT: https://hf-mirror.com


接着处理原始微博数据：划分训练集与测试集并保存为 CSV

In [2]:
from pathlib import Path

from sklearn.model_selection import train_test_split
import pandas as pd

# All dataset files live under ./data relative to the notebook's working
# directory -- the same location the `load_dataset` cell reads the splits from.
# Using one relative root avoids machine-specific absolute paths.
DATA_DIR = Path("./data")

# Load the full raw Weibo sentiment CSV into a DataFrame.
df = pd.read_csv(DATA_DIR / "simplifyweibo_4_moods.csv")

# Hold out 20% of the rows as the test set (test_size controls the ratio);
# the fixed random_state keeps the split reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Persist both splits as CSV; index=False keeps the pandas index out of the files.
train_df.to_csv(DATA_DIR / "train_data.csv", index=False)
test_df.to_csv(DATA_DIR / "test_data.csv", index=False)

构建hugging face数据集格式

In [3]:
from datasets import load_dataset
# Map each split name to the CSV file holding that split.
data_files = dict(train="./data/train_data.csv", test="./data/test_data.csv")
# Load both CSV files with the generic "csv" loader; the result is keyed by split name.
ShortWeibo_dataset = load_dataset("csv", data_files=data_files)
ShortWeibo_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 289395
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 72349
    })
})

In [4]:
# Peek at the first training record to see the fields of a single example.
ShortWeibo_dataset['train'][0]

{'label': 3, 'review': '终于把论文写完交了，差不多到看见围脖就想吐的地步……想戒微博ORZ'}

展现数据集的样式

In [5]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, lookup by a list of row indices
            (returning column-oriented data accepted by ``pd.DataFrame``) and a
            ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in a single call (sampling without
    # replacement), so no retry loop or membership check is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show class names instead of integer ids. `names` is bound as a
            # default argument so the lambda does not depend on the loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))

In [6]:
# Display 10 randomly chosen training rows with the helper defined above.
show_random_elements(ShortWeibo_dataset['train'], 10)

Unnamed: 0,label,review
0,0,"第三期工程师线上400答疑活动预热开始，手机使用问题火热征集Ing ,有开心网账号的童鞋们快去提问，还能赢取精美卡片夹"
1,1,是不是逼我不用qq 呀？你有钱了不起啊，老子就得听你的，我又不跟你一样是个傻B 。擦，上不了QQ 啦，要卸载360。卸你妹，卸了老子用屁呀，傻X 腾讯，TM 不为用户考虑，操你大爷，争个屁呀，老子有火气啦！！！
2,3,你嘴角有颗米，忘记擦了。今天阿里巴巴来公司拍照& 录影，我早餐还没吃完呢，就被拉去站前台……傻乎乎地站在那里，哎呀呀～～真是想挖个地洞钻进去了……
3,1,yiwangfanni bendeyaoside benbenbenbenbenbieliwole
4,0,你懂得太多了老顽童可比不上这小顽童！小顽童太有趣啦！玛丽萨-勃兰特兹和丈夫去加拿大的一个国家公园旅游，在一个湖边，这对夫妇打算留影纪念。于是，玛丽萨将相机设为自拍模式，并调好了时间。当他们摆好pose ，即将拍摄的时候，一只顽皮的松鼠突然跳出来，“抢了镜头”。
5,1,宜黄事件反映的是手中掌握着权力的一撮人，公然违反法律，公然漠视公民最基本的自由权和财产权，还要滥用权力，去阻断公民知道真相的渠道，......
6,0,今年中秋有花燈 ，月餅 還 有............駕 駛 執 照  
7,2,刚洗完初温为谵语末温为徐雷的温水偏冷澡吹着空调听雷声精神奕奕唯一败笔就是没有牛奶喝……
8,0,回复你真厚脸皮为了美，横刀向我又能怎样贪婪，让一些女人活得很纠结有一部分女人活得累，是虚荣心太强了，总想把自己装饰最美！。活的累的女人是不会活的。女人要懂得善待自己，不要“爱”男人太多，把一部分爱分出来“爱自己”，就不会活的累了。幸福来源于自己的内心，而不是男人的给予。
9,3,多少痴男怨女为情痴狂，却还有几人肯为被强权欺辱杀戮的同胞泪流！兄弟挺住！我们都还年轻，能熬得到天亮。到现在我内心里也不愿接受这一事实，每天晚上家里空荡荡的，我总感觉父亲只不过是出去串门去了，过一会儿就会回来，他会回来的，会的。。。


加载模型评估metrics

In [7]:
import evaluate
# Combine accuracy and Matthews correlation into a single metric object so
# both are reported from one compute() call.
metric = evaluate.combine(["accuracy", "matthews_correlation"])

模型标签对应

In [8]:
# Display name for each integer class id used in the dataset's `label` column.
id2label = {
    0: '喜悦?',
    1: '灰常生气！',
    2: '感到很讨厌',
    3: '情绪低落',
}
# Inverse lookup (name -> id), derived from id2label so the two mappings stay in sync.
label2id = {name: idx for idx, name in id2label.items()}

加载模型以及tokenizer

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Checkpoint to fine-tune. Alternative base model: "google-bert/bert-base-chinese".
model_name = "yiyanghkust/finbert-tone-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels is the number of output classes. ignore_mismatched_sizes=True lets
# the checkpoint load even though its classifier head has a different output
# size; the mismatched head weights are newly initialized and must be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    ignore_mismatched_sizes=True,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-tone-chinese and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Display the module tree of the loaded model.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

对整个数据集进行分词（tokenize）

In [11]:
def preprocess_function(example, max_length=30):
    """Tokenize the ``review`` text of one example or one batch of examples.

    Args:
        example: Mapping with a ``"review"`` entry holding the text (or a list
            of texts when called in batched mode).
        max_length: Token length passed to the tokenizer. With
            ``truncation=True`` and ``padding="max_length"`` every sequence is
            cut or padded to this length. Defaults to 30.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        example["review"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

In [12]:
# Apply preprocess_function to every split in batched mode; the tokenizer
# outputs are added to the dataset as new columns.
encoded_dataset = ShortWeibo_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/289395 [00:00<?, ? examples/s]

Map:   0%|          | 0/72349 [00:00<?, ? examples/s]

In [13]:
# Display the tokenized dataset to check its columns and row counts.
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'review', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 289395
    })
    test: Dataset({
        features: ['label', 'review', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 72349
    })
})

小数据集测试（可选）：从训练集和测试集中各抽取 1 万条样本，便于快速调试。注意：下方的 Trainer 使用的仍是完整的 encoded_dataset，并未使用这两个子集。

In [14]:
# Shuffle each split with a fixed seed and keep its first 10,000 rows,
# giving small reproducible subsets for quick experiments.
small_dataset_train, small_dataset_test = (
    encoded_dataset[split].shuffle(seed=42).select(range(10000))
    for split in ("train", "test")
)

参数初始化

In [15]:
from transformers import TrainingArguments, Trainer
task = "sentiment"
batch_size = 32
# Metric used to select the best checkpoint. It must be one of the keys
# returned by compute_metrics ("accuracy" or "matthews_correlation").
metric_name = "matthews_correlation"
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",  # output directory for checkpoints
    eval_strategy="epoch",
    # Checkpoints are saved on the same schedule as evaluation, which
    # load_best_model_at_end relies on to find the best one.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Use the metric declared above instead of a separate hard-coded name, so
    # the selection criterion is defined in exactly one place.
    metric_for_best_model=metric_name,
    report_to="none",
    # push_to_hub=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


计算metrics的函数

In [16]:
import numpy as np
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(scores, labels)``. ``scores`` holds one row of
            per-class scores per example; ``labels`` holds the reference ids.

    Returns:
        The result of ``metric.compute`` for the arg-max class ids versus
        ``labels``.
    """
    scores, gold_labels = eval_pred
    # The predicted class of each row is the column with the highest score.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=gold_labels)

训练

In [17]:
# Training and evaluation run on the full tokenized train/test splits.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument; the
    # tokenizer is still saved together with the model checkpoints.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [18]:
# Run fine-tuning with the arguments configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Matthews Correlation
1,1.0134,1.00349,0.591812,0.303461
2,0.8984,0.950483,0.612268,0.347084
3,0.77,0.934766,0.626491,0.373516
4,0.6549,0.953583,0.628094,0.397645
5,0.5588,1.017045,0.637369,0.433211
6,0.472,1.085164,0.625952,0.4124
7,0.412,1.10769,0.634245,0.428381
8,0.3624,1.252436,0.63994,0.435157
9,0.3182,1.355637,0.628689,0.421477
10,0.3092,1.44557,0.626408,0.417708


TrainOutput(global_step=90440, training_loss=0.5798511520311507, metrics={'train_runtime': 5131.8817, 'train_samples_per_second': 563.916, 'train_steps_per_second': 17.623, 'total_flos': 4.4615854207548e+16, 'train_loss': 0.5798511520311507, 'epoch': 10.0})

模型评估（由于设置了 load_best_model_at_end=True，训练结束后会重新加载验证指标最好的 checkpoint，因此这里展示的是最佳模型的评估结果，而不是最后一个 epoch 的结果）

In [19]:
# Evaluate the model currently held by the trainer on the evaluation split.
trainer.evaluate()

{'eval_loss': 1.2524356842041016,
 'eval_accuracy': 0.6399397365547554,
 'eval_matthews_correlation': 0.43515655198842196,
 'eval_runtime': 34.0455,
 'eval_samples_per_second': 2125.067,
 'eval_steps_per_second': 66.411,
 'epoch': 10.0}

将模型上传到hugging face中以便后续使用

In [20]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget; authentication is required
# before pushing to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# Upload the trained model files to the Hugging Face Hub.
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/qinweijia/finbert-tone-chinese-finetuned-sentiment/commit/fd0790929c2655317cb1d521d0a5b97ec5b28e43', commit_message='End of training', commit_description='', oid='fd0790929c2655317cb1d521d0a5b97ec5b28e43', pr_url=None, repo_url=RepoUrl('https://hf-mirror.com/qinweijia/finbert-tone-chinese-finetuned-sentiment', endpoint='https://hf-mirror.com', repo_type='model', repo_id='qinweijia/finbert-tone-chinese-finetuned-sentiment'), pr_revision=None, pr_num=None)

使用训练好的模型做推理（注意：以下单元格的执行计数不连续，保存的输出可能与代码不同步，建议重启内核后按顺序重新运行）

In [3]:
from transformers import pipeline
# Build a text-classification pipeline from a locally saved fine-tuned
# checkpoint; device=0 places it on the first GPU.
classifier = pipeline(
    "sentiment-analysis",
    model="/home/pod/shared-nvme/NLP-study/TextSequenceClassification/SentimentAnalysisOFBiliBiliPopUps/yiyanghkust/finbert-tone-chinese-finetuned-sentiment/checkpoint-72352",
    device=0,
)
classifier

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7f2a5f78c370>

In [6]:
# Classify one sentence and attach the input text to the first prediction so
# the displayed dict shows the input next to its label and score.
text = "李洋是傻逼"
out = classifier(text)
out[0].update(content=text)
out[0]

{'label': '喜悦?', 'score': 0.9988777041435242, 'content': '李洋是傻逼'}

In [24]:
# Score of the first prediction returned by the classifier.
out[0]['score']

0.7614191174507141