In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import Trainer,TrainingArguments,TrainerCallback
import numpy as np
import evaluate
from torch.utils.tensorboard import SummaryWriter
from model.bert import BertCRF
from huggingface_hub import notebook_login

In [None]:
# Log this notebook session in to the Hugging Face Hub.
# The training arguments further down set push_to_hub=True, so credentials are needed.
notebook_login()

In [2]:
# Identifier of the NER dataset handed to load_dataset.
data_name = "PassbyGrocer/weibo-ner"
# Identifier of the pretrained checkpoint used for both the tokenizer and BertCRF.
bert_name = "google-bert/bert-base-chinese"

In [3]:
# NOTE(review): trust_remote_code=True allows the dataset repository to run its own
# loading code on this machine -- confirm the repository is trusted.
dataset = load_dataset(data_name,trust_remote_code=True)
# Tokenizer that belongs to the checkpoint named by bert_name.
tokenizer = AutoTokenizer.from_pretrained(bert_name)

In [4]:
# Show the label schema of the "ner_tags" feature.
print(dataset["train"].features["ner_tags"])

# Build the id -> name map once; its size also gives the number of labels for the head.
id2label = dict(enumerate(dataset["train"].features["ner_tags"].feature.names))
model = BertCRF.from_pretrained(bert_name, num_labels=len(id2label))

# Store both directions of the label mapping on the model config.
model.config.id2label = id2label
model.config.label2id = {name: idx for idx, name in id2label.items()}

Sequence(feature=ClassLabel(names=['O', 'B-GPE.NAM', 'I-GPE.NAM', 'B-GPE.NOM', 'I-GPE.NOM', 'B-LOC.NAM', 'I-LOC.NAM', 'B-LOC.NOM', 'I-LOC.NOM', 'B-ORG.NAM', 'I-ORG.NAM', 'B-ORG.NOM', 'I-ORG.NOM', 'B-PER.NAM', 'I-PER.NAM', 'B-PER.NOM', 'I-PER.NOM'], id=None), length=-1, id=None)


model.safetensors:  74%|#######3  | 304M/412M [00:00<?, ?B/s]

Some weights of BertCRF were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight', 'crf.end_transitions', 'crf.start_transitions', 'crf.transitions']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Label names of the "ner_tags" feature (the same assignment is repeated in the metrics cell).
label_list = dataset["train"].features["ner_tags"].feature.names
def tokenize_and_align_labels(examples):
    """Tokenize pre-split examples and give every sub-token its word's label.

    Args:
        examples: Batch mapping with "tokens" (one list of tokens per example)
            and "ner_tags" (one list of label ids per example, parallel to
            "tokens").

    Returns:
        The tokenizer output with an added "labels" entry. Each example gets
        one label id per produced token: the label of the word the token came
        from, or -100 where the token maps to no word.
    """
    # Deliberately no padding here. Padding inside a batched map stretches every
    # example to the longest sequence of the whole map batch, which inflates
    # memory use during training. DataCollatorForTokenClassification pads the
    # inputs and labels per training batch instead.
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    tokenized_inputs["labels"] = [
        [
            -100 if word_id is None else word_labels[word_id]
            for word_id in tokenized_inputs.word_ids(batch_index=i)
        ]
        for i, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

# Run the alignment function over the dataset in batches.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
# Collator later passed to the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/269 [00:00<?, ? examples/s]

In [6]:
# seqeval metric object used by compute_metrics.
metric = evaluate.load("seqeval")
# TensorBoard writer used by TensorBoardCallback; writes under ./logs.
writer = SummaryWriter(log_dir="./logs")
label_list = dataset["train"].features["ner_tags"].feature.names  # list of label names

def compute_metrics(p):
    """Turn raw model predictions into seqeval precision/recall/F1/accuracy.

    Args:
        p: Prediction object exposing ``predictions`` and ``label_ids``.
            ``predictions`` is either an array of per-token class scores
            indexed as [example][token][class], or a tuple whose first element
            is such an array. ``label_ids`` holds the reference label ids,
            with -100 marking positions to ignore.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy" taken from the
        metric's overall_* results.

    NOTE(review): labels are chosen by a per-token argmax over the scores. If
    BertCRF is meant to be decoded through its CRF layer, these numbers do not
    reflect that decoding -- confirm against the model implementation.
    """
    scores = p.predictions
    # A model returning several outputs yields a tuple here; the first element
    # is assumed to hold the class scores.
    if isinstance(scores, tuple):
        scores = scores[0]

    # Convert class scores to predicted label indices.
    predictions = np.argmax(scores, axis=2)
    references = p.label_ids

    # Map indices to string labels, skipping positions whose reference is -100.
    true_predictions = []
    true_labels = []
    for prediction, reference in zip(predictions, references):
        kept = [(pred, label) for pred, label in zip(prediction, reference) if label != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[label] for _, label in kept])

    # Compute the seqeval scores.
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [7]:
class TensorBoardCallback(TrainerCallback):
    """Trainer callback that copies loss entries from each log event to TensorBoard."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write every log entry whose key contains "loss" to the shared writer.

        Each value is recorded under the tag ``Loss/<key>`` at the trainer's
        current global step. Nothing is written when ``logs`` is missing.
        """
        loss_entries = {key: value for key, value in (logs or {}).items() if "loss" in key}
        for key, value in loss_entries.items():
            writer.add_scalar(f"Loss/{key}", value, global_step=state.global_step)

In [8]:
# NOTE(review): newer transformers releases may expect `eval_strategy` in place of
# `evaluation_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,                  # saving every epoch for 100 epochs would otherwise pile up 100 checkpoints
    logging_dir="./logs",                # log directory
    logging_strategy="steps",            # log by step count
    logging_steps=10,                    # write a log entry every 10 steps
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=100,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True,
    hub_model_id="PassbyGrocer/bert_crf-ner-weibo"
)



In [9]:
# Assemble the Trainer from the model, training arguments, tokenized splits,
# collator, metric function and the loss-logging callback.
# NOTE(review): the saved output of this cell echoes the `trainer = Trainer(` line,
# which looks like a library warning about one of these arguments -- confirm which
# one against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[TensorBoardCallback()]
)

# Start fine-tuning.
# NOTE(review): the saved output of this cell is a CUDA out-of-memory error on a
# GPU with 1.95 GiB total capacity; the batch size of 32 may be too large for it.
trainer.train()

  trainer = Trainer(


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 1.95 GiB total capacity; 1.32 GiB already allocated; 8.81 MiB free; 1.43 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
# Run prediction on the test split; the returned object carries a metrics dict.
test_results = trainer.predict(tokenized_datasets["test"])

# Print the test-set results
print("测试集上的结果:", test_results.metrics)

  _warn_prf(average, modifier, msg_start, len(result))


测试集上的结果: {'test_loss': 0.2595406770706177, 'test_precision': 0.6409090909090909, 'test_recall': 0.6746411483253588, 'test_f1': 0.6573426573426574, 'test_accuracy': 0.9674330793607984, 'test_runtime': 1.837, 'test_samples_per_second': 146.978, 'test_steps_per_second': 4.899}


In [10]:
# Upload the trained model to the Hub.
# NOTE(review): the saved output below names the repo "PassbyGrocer/bert-ner-weibo",
# while hub_model_id in the training arguments is "PassbyGrocer/bert_crf-ner-weibo";
# the output appears to come from an earlier run -- confirm.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/PassbyGrocer/bert-ner-weibo/commit/1abeb596dd28403edc4966aa6dde61583dadbe60', commit_message='End of training', commit_description='', oid='1abeb596dd28403edc4966aa6dde61583dadbe60', pr_url=None, repo_url=RepoUrl('https://huggingface.co/PassbyGrocer/bert-ner-weibo', endpoint='https://huggingface.co', repo_type='model', repo_id='PassbyGrocer/bert-ner-weibo'), pr_revision=None, pr_num=None)