In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import Trainer,TrainingArguments,TrainerCallback
import numpy as np
import evaluate
from torch.utils.tensorboard import SummaryWriter

In [2]:
dataset = load_dataset("PassbyGrocer/weibo-ner",trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")

README.md:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/145k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/34.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1350 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/270 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/269 [00:00<?, ? examples/s]

In [3]:
print(dataset["train"].features["ner_tags"])
model = AutoModelForTokenClassification.from_pretrained("google-bert/bert-base-chinese",
                                                        num_labels=len(dataset['train'].features['ner_tags'].feature.names))
id2label = {i:dataset['train'].features['ner_tags'].feature.names[i] for i in range(len(dataset['train'].features['ner_tags'].feature.names))}
model.config.id2label = id2label
model.config.label2id = {v: k for k, v in id2label.items()}

Sequence(feature=ClassLabel(names=['O', 'B-GPE.NAM', 'I-GPE.NAM', 'B-GPE.NOM', 'I-GPE.NOM', 'B-LOC.NAM', 'I-LOC.NAM', 'B-LOC.NOM', 'I-LOC.NOM', 'B-ORG.NAM', 'I-ORG.NAM', 'B-ORG.NOM', 'I-ORG.NOM', 'B-PER.NAM', 'I-PER.NAM', 'B-PER.NOM', 'I-PER.NOM'], id=None), length=-1, id=None)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
label_list = dataset["train"].features["ner_tags"].feature.names
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, padding=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer)

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/269 [00:00<?, ? examples/s]

In [5]:
metric = evaluate.load("seqeval")
writer = SummaryWriter(log_dir="./logs")
label_list = dataset["train"].features["ner_tags"].feature.names  # 获取标签名称列表

def compute_metrics(p):
    # 获取 logits，并将它们转换为预测的标签索引
    predictions = np.argmax(p.predictions, axis=2)
    references = p.label_ids

    # 转换为字符串标签，并过滤掉 -100
    true_predictions = [
        [label_list[pred] for (pred, label) in zip(prediction, reference) if label != -100]
        for prediction, reference in zip(predictions, references)
    ]
    true_labels = [
        [label_list[label] for (pred, label) in zip(prediction, reference) if label != -100]
        for prediction, reference in zip(predictions, references)
    ]

    # 计算 seqeval 指标
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [6]:
class TensorBoardCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return
        # 将每次日志的损失值记录到 TensorBoard
        for k, v in logs.items():
            if "loss" in k:
                writer.add_scalar(f"Loss/{k}", v, global_step=state.global_step)

In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",                # 指定日志目录
    logging_strategy="steps",            # 设置日志记录策略（这里按步记录）
    logging_steps=10,                    # 每 10 步记录一次日志
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True,
    hub_model_id="PassbyGrocer/bert-ner-weibo"
)



In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[TensorBoardCallback()]
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1492,0.12195,0.532934,0.686375,0.6,0.967493
2,0.0887,0.096072,0.60515,0.724936,0.659649,0.972186
3,0.0649,0.100906,0.597403,0.709512,0.648649,0.970046
4,0.0544,0.110885,0.60161,0.768638,0.674944,0.970115
5,0.0327,0.122598,0.630531,0.732648,0.677765,0.969425
6,0.022,0.129671,0.603982,0.701799,0.649227,0.966112
7,0.0221,0.141139,0.625272,0.737789,0.676887,0.968321
8,0.0175,0.146803,0.643519,0.714653,0.677223,0.96839
9,0.0146,0.150006,0.647887,0.709512,0.677301,0.96839
10,0.0121,0.152774,0.638444,0.717224,0.675545,0.967976


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=850, training_loss=0.07964737566954949, metrics={'train_runtime': 255.5736, 'train_samples_per_second': 52.822, 'train_steps_per_second': 3.326, 'total_flos': 1226525894514000.0, 'train_loss': 0.07964737566954949, 'epoch': 10.0})

In [9]:
test_results = trainer.predict(tokenized_datasets["test"])

# 输出测试结果
print("测试集上的结果:", test_results.metrics)

  _warn_prf(average, modifier, msg_start, len(result))


测试集上的结果: {'test_loss': 0.1592785269021988, 'test_precision': 0.6307692307692307, 'test_recall': 0.6866028708133971, 'test_f1': 0.6575028636884307, 'test_accuracy': 0.9680399163913425, 'test_runtime': 1.8739, 'test_samples_per_second': 144.082, 'test_steps_per_second': 9.072}


In [None]:
trainer.push_to_hub()