## BERT解决NER任务

![jupyter](./imgs/bert_ner.png)

In [1]:
import pickle
import warnings

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
# transformer封装了很多Bert
from transformers import BertTokenizer, TFBertForTokenClassification

import util

warnings.filterwarnings('ignore')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.720 seconds.
Prefix dict has been built successfully.


## 1. 构建数据集

In [2]:
dataset_save_path = './data/dataset.pkl'

with open(dataset_save_path, 'rb') as f:
    train_sentences, val_sentences, test_sentences, tag_2_id, id_2_tag = pickle.load(f)

len(train_sentences), len(val_sentences), len(test_sentences)

(10048, 700, 1343)

In [3]:
tag_2_id

{'O': 0,
 'B-address': 1,
 'I-address': 2,
 'B-book': 3,
 'I-book': 4,
 'B-company': 5,
 'I-company': 6,
 'B-game': 7,
 'I-game': 8,
 'B-government': 9,
 'I-government': 10,
 'B-movie': 11,
 'I-movie': 12,
 'B-name': 13,
 'I-name': 14,
 'B-organization': 15,
 'I-organization': 16,
 'B-position': 17,
 'I-position': 18,
 'B-scene': 19,
 'I-scene': 20}

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

test_sentence = '李正茂出任中国电信集团有限公司总经理。'

bert_input = tokenizer.encode_plus(
    test_sentence,
    add_special_tokens=True,
    max_length=50,
    pad_to_max_length=True,
    truncation=True,
    return_attention_mask=True,
)

for k, v in bert_input.items():
    print(k)
    print(v)

input_ids
[101, 3330, 3633, 5744, 1139, 818, 704, 1744, 4510, 928, 7415, 1730, 3300, 7361, 1062, 1385, 2600, 5307, 4415, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_type_ids
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [5]:
def convert_sample_to_feature(text, max_length):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )


def map_sample_to_dict(input_ids, token_type_ids, attention_masks, label):
    return {
               "input_ids": input_ids,
               "token_type_ids": token_type_ids,
               "attention_mask": attention_masks,
           }, label


def build_dataset(samples, tag_2_id, max_length, batch_size, is_train):
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for sample in samples:
        text = [x[0] for x in sample]
        label = [tag_2_id.get(x[2], 0) for x in sample][: max_length - 1]
        # 开头加PAD，即CLS
        label.insert(0, 0)
        bert_input = convert_sample_to_feature(text, max_length)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append(label)
    label_list = pad_sequences(label_list, padding='post', maxlen=max_length, )
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    )
    dataset = dataset.map(map_sample_to_dict)
    buffer_size = len(label_list)
    if is_train:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size).prefetch(buffer_size)
    return dataset

In [6]:
BATCH_SIZE = 16
MAX_SEQ_LEN = 52

# dataset
train_dataset = build_dataset(train_sentences, tag_2_id, MAX_SEQ_LEN, BATCH_SIZE, True)
val_dataset = build_dataset(val_sentences, tag_2_id, MAX_SEQ_LEN, BATCH_SIZE, False)
test_dataset = build_dataset(test_sentences, tag_2_id, MAX_SEQ_LEN, BATCH_SIZE, False)

## 2. 模型训练

![jupyter](./imgs/bert_token_classification.jpeg)

In [7]:
NUM_LABELS = len(list(tag_2_id))
LR = 1e-5
EPOCHS = 10
PATIENCE = 2

In [8]:
# 模型初始化
model = TFBertForTokenClassification.from_pretrained(
    'bert-base-chinese',
    from_pt=True,
    num_labels=NUM_LABELS
)
# 定义优化器
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric]
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=PATIENCE,
    restore_best_weights=True
)

# 模型训练
bert_history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    callbacks=[callback],
    validation_data=val_dataset
)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [10]:
# 保存模型
save_model_path = "./bert/bert_ner"
model.save_pretrained(save_model_path, saved_model=True)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./bert/bert_ner/saved_model/1/assets


## 3. 模型评估

In [11]:
output = model.predict(test_dataset)
pred_logits = output.logits
pred_label_ids = np.argmax(pred_logits, axis=2).tolist()

preds, trues = [], []

for sample, pred_ids in zip(test_sentences, pred_label_ids):
    label = [x[2] for x in sample]
    seq_len = len(label)  # 获取序列真实长度
    pred_label = [id_2_tag[x] for x in pred_ids[1: seq_len + 1]]  # 开头0为CLS，所以从1开始取
    assert len(label) == len(pred_label), (label, pred_label)
    preds.extend(pred_label)
    trues.extend(label)

# 对结果进行评估
metric_result = util.measure_by_tags(trues, preds)


                precision    recall  f1-score   support

        B-name       0.85      0.89      0.87       352
  I-government       0.77      0.84      0.80       855
        B-game       0.81      0.79      0.80       226
        I-book       0.92      0.82      0.87       715
        I-name       0.83      0.89      0.86       732
        B-book       0.88      0.81      0.84       121
       B-movie       0.80      0.80      0.80       101
       I-scene       0.69      0.73      0.71       458
       B-scene       0.62      0.64      0.63       124
     I-address       0.77      0.73      0.75      1045
        I-game       0.83      0.79      0.81      1065
    I-position       0.79      0.77      0.78       610
    B-position       0.77      0.75      0.76       347
     B-address       0.68      0.64      0.66       273
  B-government       0.77      0.81      0.78       190
I-organization       0.71      0.60      0.65       688
     I-company       0.72      0.81      0.76 

## 4. 模型预测

In [12]:
# 加载模型
save_model_path = "./bert/bert_ner"
saved_model = TFBertForTokenClassification.from_pretrained(save_model_path)

Some layers from the model checkpoint at ./bert/bert_ner were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at ./bert/bert_ner.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [13]:
# 使用模型进行预测
predict_sentences = ['李正茂出任中国电信集团有限公司总经理。',
                     '2012年成立中国电信国际有限公司,总部设于中国香港。',
                     '《长津湖》将于今年下半年上映。']

# tokenizer
predict_inputs = tokenizer(predict_sentences, padding=True, max_length=MAX_SEQ_LEN, return_tensors="tf")
# 模型前向运算
output = saved_model(predict_inputs)
# 获取标签分数
predict_logits = output.logits.numpy()
# 取最大标签分数结果
predict_label_ids = np.argmax(predict_logits, axis=2).tolist()

In [14]:
# 格式化展示结果
for text, pred_ids in zip(predict_sentences, predict_label_ids):
    print(text)
    seq_len = len(text)
    bio_seq = [id_2_tag[x] for x in pred_ids[1: seq_len + 1]]
    print(bio_seq)
    entities_result = util.bio_2_entities(bio_seq)
    json_result = util.formatting_result(entities_result, text)
    print(json_result)

李正茂出任中国电信集团有限公司总经理。
['B-name', 'I-name', 'I-name', 'O', 'O', 'B-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'B-position', 'I-position', 'I-position', 'O']
[
    {
        "begin": 0,
        "end": 3,
        "tag": "name",
        "word": "李正茂"
    },
    {
        "begin": 5,
        "end": 15,
        "tag": "company",
        "word": "中国电信集团有限公司"
    },
    {
        "begin": 15,
        "end": 18,
        "tag": "position",
        "word": "总经理"
    }
]
2012年成立中国电信国际有限公司,总部设于中国香港。
['O', 'O', 'O', 'O', 'B-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'O', 'O', 'O', 'O', 'O', 'B-address', 'I-address', 'I-address', 'I-address', 'O', 'O']
[
    {
        "begin": 4,
        "end": 14,
        "tag": "company",
        "word": "年成立中国电信国际有"
    },
    {
        "begin": 19,
        "end": 23,
        "tag": "address",
       