NER example for RoBERTa.  
The data is from People's Daily.

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the project
# directory entered later in the notebook lives on that Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%tensorflow_version 2.x

import os
import warnings
import time
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from seqeval import metrics

# Enter the project directory on the mounted Drive so that the relative model
# and dataset paths used below resolve. The Drive is mounted at /content/drive,
# so the path is given in absolute form: a relative './drive/...' only works
# from the fresh-kernel working directory and fails when the cell is re-run.
os.chdir('/content/drive/My Drive/Python/Research/bert')
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Project-local module; imported only after the chdir above, presumably
# because it lives in that directory (confirm).
import mymodels as mm

In [None]:
# Pretrained model: type tag handed to mm.BERT plus the vocabulary, config and
# checkpoint files under models/roberta_base_ch.
MODEL = 'roberta'
VOCAB = 'models/roberta_base_ch/vocab.txt'
CONFIG = 'models/roberta_base_ch/bert_config.json'
CKPT = 'models/roberta_base_ch/bert_model.ckpt'

# Tagger head and input size.
USECRF = True  # True: CRF layer on top of the dense layer; False: softmax output
MAXLEN = 128   # padded sequence length, including the two special tokens
DROP = 0.5     # dropout rate applied before the dense layer

# Optimisation settings.
BATCH = 32
EPOCH = 10
LRATE = 2e-5   # passed to the optimizer (presumably the peak learning rate)
DRATE = 1e-2   # passed to the optimizer as `drate` (presumably weight decay)

# Tag inventory; the position of each tag is its class index.
LABEL = {tag: index for index, tag in enumerate(
  ('O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'))}

In [None]:
def data_processing(file, tokenizer, label, maxlen, batch, training):
  """Read a tagged corpus file and turn it into a batched `tf.data.Dataset`.

  The file holds one `token tag` pair per line, separated by a single space,
  with an empty line between sentences. Sentences with more than `maxlen-2`
  tokens are dropped (two positions are reserved for the special tokens).

  Args:
    file: path of the UTF-8 corpus file.
    tokenizer: object whose `vocab` maps token -> id and contains '[UNK]'.
    label: dict mapping tag string -> class index.
    maxlen: padded length of every encoded sentence.
    batch: batch size of the returned dataset.
    training: when true, the dataset is shuffled before batching.

  Returns:
    `(dataset, count)`. The dataset yields `(token ids, segment ids, mask,
    one-hot labels)`; `count` is the number of sentences kept.
  """
  # Read through a context manager so the file handle is closed right away
  # instead of being left to the garbage collector.
  with open(file, encoding='utf-8') as handle:
    samples = [s for s in handle.read().split('\n\n') if len(s) > 0]

  unknown = tokenizer.vocab['[UNK]']
  token0, seg0, mask0, label0 = [], [], [], []

  for sample in samples:
    lines = sample.split('\n')

    if len(lines) > maxlen-2:
      continue

    ids, tags = [], []

    for line in lines:
      fields = line.split(' ')
      ids.append(tokenizer.vocab.get(fields[0], unknown))
      tags.append(label[fields[1]])

    pad = maxlen-len(ids)-2
    # 101 / 102 are hard-coded ids wrapped around the sentence; presumably the
    # [CLS] / [SEP] entries of the vocabulary (confirm against vocab.txt).
    token0.append([101]+ids+[102]+[0]*pad)
    seg0.append([0]*maxlen)
    # Mask convention here: 0 marks a real position, 1 marks padding.
    mask0.append([0]*(len(ids)+2)+[1]*pad)
    # The two special positions and the padding all get class index 0.
    label0.append([0]+tags+[0]+[0]*pad)

  token0, seg0, mask0 = np.array(token0), np.array(seg0), np.array(mask0)
  # Index an identity matrix to one-hot encode the class indices.
  label0 = np.eye(len(label.keys()))[label0]
  dataset = tf.data.Dataset.from_tensor_slices((token0, seg0, mask0, label0))

  if training:
    dataset = dataset.shuffle(len(token0))

  return dataset.batch(batch), len(token0)


def label_processing(file, maxlen):
  """Collect the gold tag sequences of a tagged corpus file.

  The file holds one `token tag` pair per line, separated by a single space,
  with an empty line between sentences. Sentences with more than `maxlen-2`
  lines are dropped, which is the same length filter `data_processing`
  applies, so both keep the same sentences in the same order.

  Args:
    file: path of the UTF-8 corpus file.
    maxlen: padded sequence length used for encoding.

  Returns:
    A list with one list of tag strings per kept sentence.
  """
  # Read through a context manager so the file handle is closed right away
  # instead of being left to the garbage collector.
  with open(file, encoding='utf-8') as handle:
    samples = [s for s in handle.read().split('\n\n') if len(s) > 0]

  kept = []

  for sample in samples:
    tags = [line.split(' ')[1] for line in sample.split('\n')]

    if len(tags) <= maxlen-2:
      kept.append(tags)

  return kept


# Directory of the corpus files, relative to the project directory.
path_1 = 'tasks/datasets/people_daily/'

# Tokenizer backed by the pretrained model's vocabulary file.
tokenizer_1 = mm.Tokenizer()
tokenizer_1.loading(VOCAB)

# Encoded datasets; only the training split is shuffled. len_1 / len_2 / len_3
# are the numbers of sentences kept in each split.
training_1, len_1 = data_processing(
  file=path_1+'example.train', tokenizer=tokenizer_1, label=LABEL,
  maxlen=MAXLEN, batch=BATCH, training=True)
dev_1, len_2 = data_processing(
  file=path_1+'example.dev', tokenizer=tokenizer_1, label=LABEL,
  maxlen=MAXLEN, batch=BATCH, training=False)
test_1, len_3 = data_processing(
  file=path_1+'example.test', tokenizer=tokenizer_1, label=LABEL,
  maxlen=MAXLEN, batch=BATCH, training=False)

# Gold tag strings for the two evaluation splits, used when scoring.
devlabel_1 = label_processing(file=path_1+'example.dev', maxlen=MAXLEN)
testlabel_1 = label_processing(file=path_1+'example.test', maxlen=MAXLEN)

In [None]:
class ModelBERT(keras.Model):
  """Sequence tagger: an mm.BERT encoder followed by dropout and a dense layer.

  With `crf=True` the dense layer has no activation and an `mm.CRF` layer is
  stored in `self.crf`; otherwise the dense layer applies softmax and
  `self.crf` is None.
  """
  def __init__(self, model, config, drop, category, crf=False):
    """Build the layers.

    Args:
      model: model type tag forwarded to `mm.BERT`.
      config: path of the encoder configuration file.
      drop: dropout rate applied to the encoder output.
      category: number of output classes.
      crf: whether to add a CRF layer on top of the dense layer.
    """
    super().__init__()
    self.bert = mm.BERT(config, model, 'seq')
    self.drop = keras.layers.Dropout(drop)
    activation = None if crf else 'softmax'
    self.dense = keras.layers.Dense(category, activation)
    self.crf = mm.CRF(category) if crf else None

  def propagating(self, text, segment, mask, training):
    """Run the encoder, dropout and dense layer and return the dense output.

    The CRF layer, when present, is not applied here; callers use
    `self.crf` separately on the returned values.
    """
    hidden = self.bert.propagating(text, segment, mask, training)
    hidden = self.drop(hidden, training=training)
    return self.dense(hidden)


class ModelNER(object):
  """Inference wrapper that turns tagger outputs into tag strings and entities.

  Attributes:
    tokenizer: tokenizer exposing `vocab` (token -> id) and `encoding`.
    model: tagger exposing `propagating` and a `crf` attribute (layer or None).
    crf: True when the model carries a CRF layer.
    vocab: list of tokens used as an id -> token lookup.
    map: list of tag strings used as a class index -> tag lookup.
  """
  def __init__(self, model, tokenizer, label):
    """Store the model and tokenizer and build the reverse lookups.

    Args:
      model: the tagger to run.
      tokenizer: tokenizer whose `vocab` maps token -> id.
      label: dict mapping tag string -> class index.
    """
    self.tokenizer, self.model = tokenizer, model
    self.crf = self.model.crf is not None
    # Relies on the vocabulary mapping being ordered by id.
    self.vocab = list(self.tokenizer.vocab.keys())
    # Order tags by their class index so the lookup does not depend on the
    # insertion order of `label`.
    self.map = [tag for tag, _ in sorted(label.items(), key=lambda item: item[1])]

  def predicting(self, text, seg, mask):
    """Return one list of tag strings per input sequence.

    The first and last tag of each sequence are cut off before returning
    (the encoded inputs are wrapped in two special tokens).
    """
    scores = self.model.propagating(text, seg, mask, False)
    collection = []

    if self.crf:
      # Each decoded path is trimmed by one element at both ends; this assumes
      # the CRF decoder returns only the unpadded positions (confirm in mm.CRF).
      for path in self.model.crf.decoding(scores, mask):
        names = [self.map[int(index)] for index in path]
        collection.append(names[1:-1])

    else:
      for row, indices in enumerate(np.argmax(scores, 2).tolist()):
        # Position of the first 1 in the mask, i.e. the first padded slot; the
        # appended 1 makes this the full length when nothing is padded.
        length = np.argmax(np.array(mask[row]).tolist()+[1])
        names = [self.map[index] for index in indices[:length]]
        collection.append(names[1:-1])

    return collection

  def detecting(self, text, seg, mask):
    """Return, per input sequence, a list of `[entity text, entity type]`.

    An entity starts at a `B-` tag and is extended by the following non-'O'
    tags; the type is taken from the `B-` tag only. Non-'O' tags that do not
    follow an open entity are ignored.
    """
    predicted = self.predicting(text, seg, mask)
    collection = []

    for row, ids in enumerate(text):
      entities = []
      # Reset for every sentence: otherwise a sequence that begins with an
      # `I-` tag either reads the flag before it exists or, after a sentence
      # that ended inside an entity, appends to an empty entity list.
      inside = False

      for pos, tag in enumerate(predicted[row]):
        # ids[pos+1]: skip the leading special token of the encoded input.
        if tag != 'O' and tag[0] == 'B':
          inside = True
          entities.append([self.vocab[ids[pos+1]], tag[2:]])
        elif tag != 'O' and inside:
          entities[-1][0] += self.vocab[ids[pos+1]]
        else:
          inside = False

      collection.append(entities)

    return collection

  def testing(self, sentence, maxlen=64):
    """Encode one raw sentence and return its entities as a batch of one."""
    ids, segment, mask = self.tokenizer.encoding(sentence, maxlen=maxlen)
    return self.detecting(np.array([ids]), np.array([segment]), np.array([mask]))


# Build the tagger and load the pretrained checkpoint into its encoder.
model_1 = ModelBERT(model=MODEL, config=CONFIG, drop=DROP, category=len(LABEL), crf=USECRF)
model_1.bert.loading(CKPT)

# Inference wrapper sharing the same model instance.
ner_1 = ModelNER(model=model_1, tokenizer=tokenizer_1, label=LABEL)

# Unreduced cross-entropy for the softmax head (the training step masks and
# averages it itself) and a running mean used for logging the training loss.
function_1 = keras.losses.CategoricalCrossentropy(reduction=keras.losses.Reduction.NONE)
loss_1 = tf.keras.metrics.Mean(name='training_loss')

# Step count handed to the optimizer: epochs times (sentences / batch size + 1).
# This is a float and slightly above the real number of batches.
step_1 = (len_1/BATCH+1)*EPOCH
optimizer_1 = mm.AdamWV2(step_1, LRATE, drate=DRATE)

In [None]:
@tf.function
def step_training(text, segment, mask, y, crf):
  """Run one optimisation step on a batch and record its loss in `loss_1`.

  Args:
    text, segment, mask: encoded inputs; `mask` is 1 on padded positions.
    y: one-hot gold labels.
    crf: True to use the CRF layer's loss, False for masked cross-entropy.
  """
  with tf.GradientTape() as tape:
    outputs = model_1.propagating(text, segment, mask, True)

    if not crf:
      # 1-mask is 1 on real positions, so padding does not contribute and the
      # sum is averaged over the real positions only.
      valid = tf.cast((1-mask), tf.float32)
      objective = tf.reduce_sum(function_1(y, outputs)*valid)/tf.reduce_sum(valid)
    else:
      objective = tf.reduce_mean(model_1.crf.calculating(y, outputs, mask))

  variables = model_1.trainable_variables
  gradients = tape.gradient(objective, variables)
  # Clip by global norm 1.0 before applying the update.
  gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
  optimizer_1.apply_gradients(zip(gradients, variables))
  loss_1(objective)


def step_evaluating(data, label, model):
  """Predict tags for every batch in `data` and score them against `label`.

  Args:
    data: iterable of `(text, segment, mask, y)` batches; `y` is not used.
    label: gold tag sequences, one list of tag strings per sentence.
    model: object whose `predicting(text, segment, mask)` returns a list of
      tag sequences for the batch.

  Returns:
    `(f1 score, classification report)` as computed by `seqeval.metrics`.
  """
  predictions = []

  for batch in data:
    text, segment, mask, _ = batch
    predictions.extend(model.predicting(text, segment, mask))

  score = metrics.f1_score(label, predictions)
  report = metrics.classification_report(label, predictions)
  return score, report


# Log templates for the periodic training message and the per-epoch dev score.
temp_1 = 'Training loss is {:.4f}, and step cost is {:.4f}.'
temp_2 = 'Dev F1 score is {:.4f}, and epoch cost is {:.4f}.'
# Global step counter; it keeps counting across epochs.
count_1 = 0

for e_1 in range(EPOCH):
  print('Epoch {} running.'.format(e_1+1))
  time_0 = time.time()

  for x_1, x_2, x_3, y_1 in training_1:
    count_1 += 1
    time_1 = time.time()
    step_training(x_1, x_2, x_3, y_1, USECRF)

    if count_1 % 200 != 0:
      continue

    # loss_1 is never reset, so this is the mean loss over every step since
    # training began; the time is that of the current step only.
    print(temp_1.format(float(loss_1.result()), time.time()-time_1))

  # Score on the dev split once per epoch.
  f_1, _ = step_evaluating(dev_1, devlabel_1, ner_1)
  print(temp_2.format(f_1, time.time()-time_0))
  print('**********')
  print('**********')

Epoch 1 running.
Training loss is 33.5846, and step cost is 0.5706.
Training loss is 18.1014, and step cost is 0.5659.
Training loss is 12.6950, and step cost is 0.5678.
Dev F1 score is 0.9153, and epoch cost is 398.1359.
**********
Epoch 2 running.
Training loss is 9.8677, and step cost is 0.5665.
Training loss is 8.1282, and step cost is 0.5592.
Training loss is 6.9662, and step cost is 0.5701.
Dev F1 score is 0.9273, and epoch cost is 380.0600.
**********
Epoch 3 running.
Training loss is 6.0804, and step cost is 0.5674.
Training loss is 5.3927, and step cost is 0.5688.
Training loss is 4.8628, and step cost is 0.5644.
Dev F1 score is 0.9527, and epoch cost is 379.7818.
**********
Epoch 4 running.
Training loss is 4.4234, and step cost is 0.5675.
Training loss is 4.0556, and step cost is 0.5832.
Training loss is 3.7500, and step cost is 0.5656.
Dev F1 score is 0.9566, and epoch cost is 379.7569.
**********
Epoch 5 running.
Training loss is 3.4889, and step cost is 0.5687.
Training l

In [None]:
# Final evaluation on the held-out test split; show the per-type report.
f_1, c_1 = step_evaluating(data=test_1, label=testlabel_1, model=ner_1)
print(c_1)

           precision    recall  f1-score   support

      ORG       0.93      0.94      0.93      2066
      LOC       0.96      0.96      0.96      3291
      PER       0.98      0.98      0.98      1668

micro avg       0.95      0.96      0.96      7025
macro avg       0.95      0.96      0.96      7025



In [None]:
# Smoke test on a raw sentence. `testing` wraps the sentence in a batch of one
# and returns one entity list per sentence, so [0] selects that single list.
ner_1.testing('陈旺财已经从匹兹堡回到上海了。')[0]

[['陈旺财', 'PER'], ['匹兹堡', 'LOC'], ['上海', 'LOC']]