## BiLSTM-CRF解决NER任务

### 深度学习解决NLP监督任务  
![jupyter](./imgs/dl_for_nlp.png)

### BiLSTM-CRF网络结构  
![jupyter](./imgs/bilstm_crf.png)

In [1]:
import json
import pickle
import tensorflow as tf
from sklearn.model_selection import train_test_split

import util

import warnings
warnings.filterwarnings('ignore')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/96/0jnny1gj02j8pgfvb9by368m0000gn/T/jieba.cache
Loading model cost 0.637 seconds.
Prefix dict has been built successfully.


## 1. 构造数据集

In [2]:
train_data_file = './bio/train.bio'
test_data_file = './bio/test.bio'

# 读取bio格式数据
raw_train_sentences = util.read_bio_data(train_data_file)
test_sentences = util.read_bio_data(test_data_file)
len(raw_train_sentences), len(test_sentences)

(10748, 1343)

In [3]:
# 划分出验证集
# 深度学习，划分验证集的大小和测试集差不多即可
train_sentences, val_sentences = train_test_split(
    raw_train_sentences, test_size=700, random_state=1234
)
len(train_sentences), len(val_sentences)

(10048, 700)

In [4]:
test_sentences[0][:10]

[['彭', 'B-nr', 'B-name'],
 ['小', 'I-nr', 'I-name'],
 ['军', 'E-nr', 'I-name'],
 ['认', 'B-v', 'O'],
 ['为', 'E-v', 'O'],
 ['，', 'S-x', 'O'],
 ['国', 'B-s', 'O'],
 ['内', 'E-s', 'O'],
 ['银', 'B-n', 'O'],
 ['行', 'E-n', 'O']]

In [5]:
# bio标签到id的映射
tag_2_id = {
    'O': 0,
    'B-address': 1,
    'I-address': 2,
    'B-book': 3,
    'I-book': 4,
    'B-company': 5,
    'I-company': 6,
    'B-game': 7,
    'I-game': 8,
    'B-government': 9,
    'I-government': 10,
    'B-movie': 11,
    'I-movie': 12,
    'B-name': 13,
    'I-name': 14,
    'B-organization': 15,
    'I-organization': 16,
    'B-position': 17,
    'I-position': 18,
    'B-scene': 19,
    'I-scene': 20
}

# id到bio标签的映射
id_2_tag = {v: k for k, v in tag_2_id.items()}

In [6]:
# 将数据集以及标签id之间的映射序列化保存，供后续的模型使用
dataset_save_path = './data/dataset.pkl'
with open(dataset_save_path, 'wb') as f:
    data = (train_sentences, val_sentences, test_sentences, tag_2_id, id_2_tag)
    pickle.dump(data, f)

In [7]:
# 如何读取序列化后的数据
with open(dataset_save_path, 'rb') as f:
    train_sentences, val_sentences, test_sentences, tag_2_id, id_2_tag = pickle.load(f)
    
len(train_sentences), len(val_sentences), len(test_sentences)

(10048, 700, 1343)

In [8]:
# 将训练语料中的字取出来
words = set()
for sentence in train_sentences:
    for word, _, _ in sentence:
        words.add(word.lower())
words = list(words)
words.insert(0, '<UNK>')  # 先在0位置插入unk
words.insert(0, '<PAD>')  # 再在0位置插入PAD
# word映射为id，即tokenizer
word_2_id = {word: index for index, word in enumerate(words)}

len(words), word_2_id['中'], words[:10]

(3586, 2116, ['<PAD>', '<UNK>', '朱', '穆', '素', '淹', '橘', '徊', '瞅', '妖'])

In [9]:
# 序列化保存word与id之间的映射关系
save_path = './bilstm_crf/maps.pkl'
with open(save_path, 'wb') as f:
    data = (words, word_2_id)
    pickle.dump(data, f)

In [10]:
# 读取word与id之间的映射关系
save_path = './bilstm_crf/maps.pkl'
with open(save_path, 'rb') as f:
    words, word_2_id = pickle.load(f)

## 2. 准备TF数据集

In [11]:
MAX_SEQ_LEN = 50 # 最长序列长度
BATCH_SIZE = 64  

In [12]:
# 制作TF数据集
def build_dataset(bio_sequences, word_2_id, tag_2_id, max_seq_len, batch_size, is_train):
    text_seqs = []
    tag_seqs = []
    for seq in bio_sequences:
        current_text_seq = []
        current_tag_seq = []
        for word, _, tag in seq:
            # 不在训练词表中，则获取unk
            current_text_seq.append(word_2_id.get(word, 1))
            current_tag_seq.append(tag_2_id.get(tag, 0))
        text_seqs.append(current_text_seq)
        tag_seqs.append(current_tag_seq)
    pad_text_seqs = tf.keras.preprocessing.sequence.pad_sequences(text_seqs, padding='post', maxlen=max_seq_len)
    pad_tag_seqs = tf.keras.preprocessing.sequence.pad_sequences(tag_seqs, padding='post', maxlen=max_seq_len)
    dataset = tf.data.Dataset.from_tensor_slices((pad_text_seqs, pad_tag_seqs))
    if is_train:
        buffer_size = len(pad_tag_seqs)
        dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True).prefetch(buffer_size)
    else:
        dataset = dataset.batch(batch_size, drop_remainder=False).prefetch(batch_size)
    return dataset

In [13]:
train_dataset = build_dataset(train_sentences, word_2_id, tag_2_id, MAX_SEQ_LEN, BATCH_SIZE, is_train=True)
val_dataset = build_dataset(val_sentences, word_2_id, tag_2_id, MAX_SEQ_LEN, BATCH_SIZE, is_train=False)
test_dataset = build_dataset(test_sentences, word_2_id, tag_2_id, MAX_SEQ_LEN, BATCH_SIZE, is_train=False)

In [14]:
for example, label in val_dataset.take(1):
    print('texts: ', example.numpy()[:2])
    print()
    print('labels: ', label.numpy()[:2])

texts:  [[2918 3314 2605 2392 1965 1573 2116 3116 3456 2468  187 2089  141 2100
  1203 2462 2899 1581   30 2070 1091  188  548 1888 2704 1353 1749 1729
  3242 3508 2162 3021 2395  660 2899 1338 2933 3541    0    0    0    0
     0    0    0    0    0    0    0    0]
 [2007 2936 2392 2116 3250  341 1176 2899 2311 2820 2795   83 2070 1681
  2580  196 3335 2392  548 1888 2704  263 2936 2266 2915 1081  263 3366
   827 3335 2899 3007 2400 2509 2116 1983 2070    0    0    0    0    0
     0    0    0    0    0    0    0    0]]

labels:  [[13 14 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  9 10
  10 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  5  6  6  6  0  1  2  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]]


## 3. 模型训练

![jupyter](./imgs/bilstm_crf_all.png)

In [15]:
import tensorflow as tf
import tensorflow_addons as tf_ad


class NerModel(tf.keras.Model):
    def __init__(self, lstm_dim, embedding_size, vocab_size, label_size, dropout_rate=0.5):
        super(NerModel, self).__init__()
        self.lstm_dim = lstm_dim  # lstm维度
        self.vocab_size = vocab_size  # word embedding词表大小
        self.label_size = label_size  # 标签数量
        self.dropout_rate = dropout_rate # dropout比例
        # 定义embedding层
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_size,
            embeddings_regularizer='l2',
            # 用0做mask
            mask_zero=True
        )
        # dropout层
        self.dropout = tf.keras.layers.Dropout(self.dropout_rate)
        # BiLSTM层
        self.biLSTM = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                self.lstm_dim,
                #不是拿最终的输出，是每个状态的输出都是需要取出来的
                return_sequences=True,
                activation='tanh',
                activity_regularizer='l2',
                dropout=self.dropout_rate
            )
        )
        # 标签分类层，提取发射分数
        self.dense = tf.keras.layers.Dense(
            self.label_size, activation='relu', activity_regularizer='l2'
        )
        # 定义CRF转移矩阵，提取转移分数
        self.transition_params = tf.Variable(
            tf.random.uniform(shape=(self.label_size, self.label_size))
        )

    def call(self, text, labels=None, training=None):
        # 获取原始文本的真实长度，即token id不为0的长度(之前做了padding)
        text_lens = tf.math.reduce_sum(tf.cast(tf.math.not_equal(text, 0), dtype=tf.int32), axis=-1)
        # embedding
        inputs = self.embedding(text)
        # dropout
        X = self.dropout(inputs, training)
        # bilstm特征抽取
        X = self.biLSTM(X)
        # 发射分数
        logits = self.dense(X)
        # 如果label不为空，可以算loss
        if labels is not None:
            # 将标签序列转化为tf tensor
            label_sequences = tf.convert_to_tensor(labels, dtype=tf.int32)
            # 使用tf_ad.text.crf_log_likelihood定义crf层，获取crf loss以及更新转移矩阵
            log_likelihood, self.transition_params = tf_ad.text.crf_log_likelihood(
                inputs=logits,
                tag_indices=label_sequences,
                sequence_lengths=text_lens,
                transition_params=self.transition_params
            )
            # 返回发射分数，文本真实长度，crf loss
            return logits, text_lens, log_likelihood
        else:
             # 返回发射分数，文本真实长度(还原真实文本用的)
            return logits, text_lens

In [16]:
LSTM_DIM = 512
EMBEDDING_DIM = 128
DROPOUT = 0.5
LR = 2e-3

vocab_size = len(words)
label_size= len(tag_2_id)

# 初始化bilstm crf模型
model = NerModel(
    LSTM_DIM,
    EMBEDDING_DIM,
    vocab_size,
    label_size,
    DROPOUT)

# 定义优化器
optimizer = tf.keras.optimizers.Adam(LR)

![jupyter](./imgs/bilstm_crf_score.png)

In [17]:
# 定义模型的check ponit
output_dir = './bilstm_crf'
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(output_dir))
ckpt_manager = tf.train.CheckpointManager(ckpt,
                                          output_dir,
                                          checkpoint_name='model.ckpt',
                                          max_to_keep=3    # 最多保存3步
                                         )

# 定义一次batch计算过程
def run_one_step(model, text_batch, labels_batch, training=True):
    with tf.GradientTape() as tape:
        # 取出模型前向运算的发射分数、文本真实长度、crf loss
        logits, text_lens, log_likelihood = model(
            text_batch, labels_batch, training
        )
        # 将batch的crf loss进行平均
        loss = - tf.reduce_mean(log_likelihood)
    if training:
        # 如果是训练，需要通过优化器进行梯度的更新
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    # 验证、测试阶段无需更新梯度
    return loss, logits, text_lens


# 定义模型预测
def predict_result(model, dataset, id_2_tag):
    # 初始化loss、预测标签、真实标签列表
    losses, preds, trues = [], [], []
    # 对dataset进行batch计算
    for _, (text_batch, labels_batch) in enumerate(dataset):
        # 进行一次前向计算，获取crf loss、发射分数、文本真实长度
        loss, logits, text_lens = run_one_step(model, text_batch, labels_batch, False)
        losses.append(loss)
        for logit, text_len, labels in zip(logits, text_lens, labels_batch):
            # 根据序列真实长度使用维特比解码出最优序列
            viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len], model.transition_params)
            # 将最优序列作为预测序列
            preds.extend(viterbi_path)
            # 还原真实的标签序列
            trues.extend(labels.numpy()[: text_len])
    # 将标签id还原为标签
    true_bios = [id_2_tag[i] for i in trues] 
    predict_bios = [id_2_tag[i] for i in preds] 
    return true_bios, predict_bios, losses

# 结果评价，主要用于训练过程中查看验证集结果
def metrics(model, dataset, tags):
    true_bios, predict_bios, losses = predict_result(model, dataset, tags)
    f1_score = util.get_f1_score(true_bios, predict_bios)  # 基于实体的f1 score
    avg_loss = sum(losses) / len(losses) # 平均的loss
    return f1_score, avg_loss

In [18]:
EPOCHS = 20  # 迭代次数
best_f1 = 0.0  # 记录最优的f1 score
step = 0 # 记录训练步数
early_stop_step = 0 # 记录早停步数
STOP_STEP = 10 # 设置早停等待步数

# 实现early stopping的代码
for epoch in range(EPOCHS):
    for _, (text_batch, labels_batch) in enumerate(train_dataset):
        step = step + 1
        # 一次训练过程，只取出loss
        loss, _, _ = run_one_step(model, text_batch, labels_batch, True)
        # 每隔50步打印训练的中间结果
        if step % 50 == 0:
            print(f'Epoch {epoch}, step {step}, train_loss {loss}')
            if epoch > 5:  # 从第5个epoch开始计算验证集结果
                # 计算验证集的实体分类f1 score，以及loss
                f1_score, avg_loss = metrics(model, val_dataset, id_2_tag)
                print(f'Validation Result: val_f1 {f1_score}, val_loss {avg_loss}')
                # 记录最优的f1 score
                if f1_score > best_f1:
                    best_f1 = f1_score
                    ckpt_manager.save()  # 记录最优时模型的权重
                    print(f'New best f1: {best_f1}, model saved!')
                    early_stop_step = 0
                else:
                    early_stop_step += 1
                # 连续一定步数最优f1不再变化，则早停
                if early_stop_step > STOP_STEP:
                    print('Early stoped!')
                    break
    if early_stop_step > STOP_STEP:
        break

print("Train finished")

Epoch 0, step 50, train_loss 41.4876708984375
Epoch 0, step 100, train_loss 35.505340576171875
Epoch 0, step 150, train_loss 28.315336227416992
Epoch 1, step 200, train_loss 27.595495223999023
Epoch 1, step 250, train_loss 21.613262176513672
Epoch 1, step 300, train_loss 22.24992561340332
Epoch 2, step 350, train_loss 18.592220306396484
Epoch 2, step 400, train_loss 19.803665161132812
Epoch 2, step 450, train_loss 15.14367961883545
Epoch 3, step 500, train_loss 14.602776527404785
Epoch 3, step 550, train_loss 12.497684478759766
Epoch 3, step 600, train_loss 11.619424819946289
Epoch 4, step 650, train_loss 9.96174430847168
Epoch 4, step 700, train_loss 10.135090827941895
Epoch 4, step 750, train_loss 8.222332954406738
Epoch 5, step 800, train_loss 7.762351989746094
Epoch 5, step 850, train_loss 10.15503215789795
Epoch 5, step 900, train_loss 6.439980506896973
Epoch 6, step 950, train_loss 8.371956825256348
Validation Result: val_f1 0.4992232004142931, val_loss 8.056118965148926
New best

## 4. 模型评估

In [19]:
# 查看模型结构
# 每一层参数和总参数相差441，sqrt(441) = 21，summary并没有展现转移矩阵的训练。
model.summary()

Model: "ner_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  459008    
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
bidirectional (Bidirectional multiple                  2625536   
_________________________________________________________________
dense (Dense)                multiple                  21525     
Total params: 3,106,510
Trainable params: 3,106,510
Non-trainable params: 0
_________________________________________________________________


In [20]:
# 使用训练集进行模型评估
true_bios, predict_bios, _ = predict_result(model, test_dataset, id_2_tag)
metric_result = util.measure_by_tags(true_bios, predict_bios)


                precision    recall  f1-score   support

     B-address       0.55      0.55      0.55       273
        B-game       0.67      0.81      0.73       226
     I-address       0.62      0.70      0.66      1045
     B-company       0.71      0.58      0.64       279
B-organization       0.61      0.50      0.55       206
       I-scene       0.64      0.47      0.54       458
        I-book       0.77      0.69      0.73       715
        B-name       0.72      0.67      0.69       352
       B-scene       0.61      0.39      0.47       124
       B-movie       0.63      0.48      0.54       101
        B-book       0.70      0.63      0.66       121
I-organization       0.63      0.52      0.57       688
    I-position       0.71      0.60      0.65       610
     I-company       0.66      0.62      0.64      1031
  I-government       0.67      0.82      0.74       855
       I-movie       0.66      0.59      0.62       580
        I-game       0.70      0.82      0.75 

In [21]:
# 可视化模型训练出的转移矩阵

import pandas as pd
transitions = list(model.transition_params.numpy())

data = []
for i, trans_list in enumerate(transitions):
    row = {'From\To': id_2_tag[i]}
    for j, trans_score in enumerate(trans_list):
        row[id_2_tag[j]] = round(trans_score, 2)
    data.append(row)
    
pd.DataFrame(data)

Unnamed: 0,From\To,O,B-address,I-address,B-book,I-book,B-company,I-company,B-game,I-game,...,B-movie,I-movie,B-name,I-name,B-organization,I-organization,B-position,I-position,B-scene,I-scene
0,O,1.54,0.62,-1.75,-0.39,-1.3,0.46,-1.6,1.01,-1.47,...,0.97,-1.35,0.57,-2.15,1.57,-2.36,0.65,-2.62,1.16,-1.98
1,B-address,-1.3,-1.02,2.52,-0.01,-0.68,-0.72,-2.07,-0.31,-1.52,...,-0.56,-1.25,-0.98,-1.38,0.0,-1.6,-0.64,-1.08,-1.0,-2.26
2,I-address,-1.41,-1.92,2.14,-0.68,-1.68,-0.63,-2.51,-0.95,-1.0,...,-0.47,-1.49,-0.76,-2.06,-1.1,-2.4,-0.57,-2.22,-0.99,-2.49
3,B-book,-1.12,-0.19,-0.73,-0.54,2.93,0.09,-0.92,-0.23,-1.61,...,-0.34,-1.6,-0.6,-1.45,-0.24,-1.12,-0.75,-1.19,0.39,-0.52
4,I-book,-1.05,-0.06,-1.33,-1.24,2.12,-0.63,-0.94,-0.61,-1.81,...,-0.5,-2.1,-0.49,-1.54,-0.86,-2.04,-0.06,-1.26,-0.77,-0.93
5,B-company,-1.47,-0.12,-1.94,-0.21,-1.58,-0.81,2.46,-0.06,-1.22,...,-0.42,-1.26,-0.07,-1.76,0.06,-1.76,-0.21,-1.57,-0.61,-1.71
6,I-company,-1.28,-0.06,-2.22,-0.76,-1.09,-2.24,2.37,0.2,-1.9,...,-0.36,-1.48,-0.07,-1.34,0.12,-2.44,0.24,-1.85,-0.24,-1.44
7,B-game,-1.29,-0.35,-1.84,0.06,-1.03,0.16,-1.28,0.16,3.04,...,-0.39,-2.27,-0.48,-1.36,-0.56,-2.28,-0.12,-0.59,-0.13,-1.19
8,I-game,-1.14,-0.79,-1.46,-0.67,-1.75,-0.19,-1.65,-1.44,1.64,...,-0.97,-2.03,-0.55,-1.18,-0.1,-2.02,-0.05,-0.68,-0.05,-0.65
9,B-government,-1.63,-0.25,-1.94,0.03,-0.67,-0.07,-1.37,-0.03,-0.82,...,-0.15,-1.48,0.11,-0.6,-0.2,-2.22,0.01,-1.68,0.19,-0.57


## 5. 模型预测

In [22]:
# 加载模型进行预测
output_dir = './bilstm_crf'
saved_model = NerModel(LSTM_DIM,
                 EMBEDDING_DIM,
                 vocab_size,
                 label_size,
                 DROPOUT)

optimizer = tf.keras.optimizers.Adam(LR)

# 从Checkpoint中还原模型权重
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=saved_model)
ckpt.restore(tf.train.latest_checkpoint(output_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa87086ac10>

In [23]:
# 在线预测
test_sentences = ['李正茂出任中国电信集团有限公司总经理。',
                 '2012年成立中国电信国际有限公司,总部设于中国香港。',
                 '《长津湖》将于今年下半年上映。']
tokenizer_sentences = []
# 进行tokenizer
for sentence in test_sentences:
    tokenizer_sentences.append(
        [word_2_id.get(word.lower(), 0) for word in sentence]
    )
# padding
dataset = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer_sentences, padding='post'
)
print(dataset)

# 使用模型进行预测
logits, text_lens = saved_model.predict(dataset)
paths = []
for logit, text_len in zip(logits, text_lens):
    # 维特比解码出最优序列
    viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len], saved_model.transition_params)
    paths.append(viterbi_path)

[[ 571 2362 3051   90 3077 2116 3250 2761 1473 3170   29 1478 2859 3242
  2382 1567 1514 1818 3541    0    0    0    0    0    0    0    0]
 [3273 1173  731 3273  823 1331  263 2116 3250 2761 1473 3250   40 1478
  2859 3242 2382   67 1567  347 1480 3366 2116 3250  808 1288 3541]
 [2287 1138 3324 3383 2416 1240 3366 1479  823 1452 1768  823 2288 1171
  3541    0    0    0    0    0    0    0    0    0    0    0    0]]


In [24]:
# 结果展示
for text, path in zip(test_sentences, paths):
    print(text)
    bio_seq = [id_2_tag[tag_id] for tag_id in path]
    print(bio_seq)
    entities_result = util.bio_2_entities(bio_seq)
    json_result = util.formatting_result(entities_result, text)
    print(json_result)

李正茂出任中国电信集团有限公司总经理。
['B-name', 'I-name', 'I-name', 'O', 'O', 'B-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'B-position', 'I-position', 'I-position', 'O']
[
    {
        "begin": 0,
        "end": 3,
        "tag": "name",
        "word": "李正茂"
    },
    {
        "begin": 5,
        "end": 15,
        "tag": "company",
        "word": "中国电信集团有限公司"
    },
    {
        "begin": 15,
        "end": 18,
        "tag": "position",
        "word": "总经理"
    }
]
2012年成立中国电信国际有限公司,总部设于中国香港。
['O', 'O', 'O', 'O', 'O', 'O', 'B-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'I-company', 'O', 'O', 'O', 'O', 'O', 'B-address', 'I-address', 'I-address', 'I-address', 'O']
[
    {
        "begin": 6,
        "end": 17,
        "tag": "company",
        "word": "立中国电信国际有限公司"
    },
    {
        "begin": 22,
        "end": 26,
        "tag":