A text classification example using BERT.  
The data comes from the NLPCC 2017 news headline categorization task: https://github.com/FudanNLP/nlpcc2017_news_headline_categorization.

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so that files stored
# on Drive can be accessed from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%tensorflow_version 2.x

import os
import warnings
import time
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from sklearn import preprocessing

# Work from the project folder on the mounted Drive; the relative paths used
# later in the notebook (models/..., tasks/...) are resolved against it.
# NOTE(review): the path is relative, so this presumably assumes the kernel
# starts in /content - confirm.
os.chdir('./drive/My Drive/Python/Research/bert')
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Imported only after the chdir above; presumably mymodels.py lives in the
# project folder and is found through the working directory - confirm.
import mymodels as mm

In [None]:
# Run configuration. Each constant is consumed by a later cell as noted.

# Passed to MyModel, which forwards them to mm.BERT as its model / mode arguments.
MODEL = 'bert'
MODE = 'cls'
# Passed to the tokenizer's encoding() as maxlen
# (presumably the fixed sequence length in tokens - confirm in mymodels).
MAXLEN = 40
# Number of units of the final softmax layer, i.e. the number of classes.
CATE = 18
# Rate of the Dropout layer applied to the BERT output.
DROP = 0.5
# Number of units of the hidden ReLU Dense layer.
DIM = 128
# Learning rate handed to mm.AdamWV2.
LRATE = 5e-5
# Batch size used for the training, dev and test datasets.
BATCH = 64
# Number of passes over the training set.
EPOCH = 3
# Vocabulary file loaded by mm.Tokenizer.
VOCAB = 'models/bert_base_ch/vocab.txt'
# Configuration file handed to mm.BERT.
CONFIG = 'models/bert_base_ch/bert_config.json'
# Pretrained checkpoint loaded into the BERT encoder.
CKPT = 'models/bert_base_ch/bert_model.ckpt'

In [None]:
# Read the three splits. The files have no header row, so the two columns are
# named explicitly: the category label first, then the headline text.
trai_1 = pd.read_table('tasks/datasets/nlpcc_2017_news/train.txt', names=['label', 'text'])
deve_1 = pd.read_table('tasks/datasets/nlpcc_2017_news/dev.txt', names=['label', 'text'])
test_1 = pd.read_table('tasks/datasets/nlpcc_2017_news/test.txt', names=['label', 'text'])

# Map the raw labels to integer ids. The encoder is fitted on the training
# split only and then reused, so all three splits share one label-to-id mapping.
labe_1 = preprocessing.LabelEncoder()
trai_1['label'] = labe_1.fit_transform(trai_1['label'])
deve_1['label'] = labe_1.transform(deve_1['label'])
test_1['label'] = labe_1.transform(test_1['label'])

# Quick look at the encoded training data.
print(trai_1.head())

   label                                               text
0      3      台 媒 预 测 周 冬 雨 金 马 奖 封 后 ， 大 气 的 倪 妮 却 佳 作 难 出
1      7  农 村 就 是 好 ， 能 吃 到 纯 天 然 无 添 加 的 野 生 蜂 蜜 ， 营 养 ...
2      5        1 4 款 知 性 美 装 ， 时 尚 惊 艳 搁 浅 的 阳 光 轻 熟 的 优 雅
3      9              火 焰 喷 射 器 1 0 0 0 度 火 焰 烧 死 鬼 子 4 连 拍
4     12                            1 8 岁 青 年 砍 死 8 8 岁 老 兵


In [None]:
def data_processing(data, tokenizer, maxlen, batch, training):
  """Encode a labelled DataFrame and wrap it in a batched tf.data.Dataset.

  Args:
    data: DataFrame with a 'text' column and a 'label' column.
    tokenizer: object exposing encoding(text, None, maxlen) that returns three
      sequences (presumably token ids, segment ids and mask - confirm in
      mymodels).
    maxlen: forwarded unchanged to tokenizer.encoding.
    batch: batch size of the returned dataset.
    training: when true, the dataset is shuffled with a buffer covering every
      example before batching; otherwise the row order is kept.

  Returns:
    A tf.data.Dataset yielding (text, type, mask, label) batches.
  """
  text1, type1, mask1, labe1 = [], [], [], []

  # Walk the two columns by position. Looking rows up with data['text'][i] is
  # label-based and breaks as soon as the index is not exactly 0..n-1 (for
  # instance after filtering, sampling or concatenating frames).
  for sent1, labe2 in zip(data['text'].values, data['label'].values):
    text2, type2, mask2 = tokenizer.encoding(sent1, None, maxlen)
    text1.append(text2)
    type1.append(type2)
    mask1.append(mask2)
    labe1.append(labe2)

  text1, type1, mask1 = np.array(text1), np.array(type1), np.array(mask1)
  data1 = tf.data.Dataset.from_tensor_slices((text1, type1, mask1, np.array(labe1)))
  return data1.shuffle(len(text1)).batch(batch) if training else data1.batch(batch)


# Create the tokenizer and load its vocabulary file.
toke_1 = mm.Tokenizer()
toke_1.loading(VOCAB)
# Encode each split. Only the training split is shuffled (training=True);
# dev and test keep their original order.
trai_2 = data_processing(trai_1, toke_1, MAXLEN, BATCH, True)
deve_2 = data_processing(deve_1, toke_1, MAXLEN, BATCH, False)
test_2 = data_processing(test_1, toke_1, MAXLEN, BATCH, False)

In [None]:
class MyModel(keras.Model):
  """BERT encoder topped with a small feed-forward classification head.

  The head is Dropout -> Dense(dim, relu) -> Dense(category, softmax), so the
  model returns class probabilities rather than logits.
  """

  def __init__(self, model, mode, config, drop, dim, category):
    """Create the encoder and the layers of the head.

    Args:
      model: forwarded to mm.BERT as its second argument.
      mode: forwarded to mm.BERT as its third argument.
      config: forwarded to mm.BERT as its first argument.
      drop: rate of the dropout layer applied to the encoder output.
      dim: number of units of the hidden ReLU layer.
      category: number of units of the softmax output layer.
    """
    super().__init__()
    self.bert = mm.BERT(config, model, mode)
    self.drop = keras.layers.Dropout(drop)
    self.dense1 = keras.layers.Dense(dim, activation='relu')
    self.dense2 = keras.layers.Dense(category, activation='softmax')

  def propagating(self, text, segment, mask, training):
    """Forward pass: encoder, dropout, hidden layer, softmax layer.

    The training flag is passed both to the encoder and to the dropout layer.
    """
    encoded = self.bert.propagating(text, segment, mask, training)
    dropped = self.drop(encoded, training=training)
    hidden = self.dense1(dropped)
    return self.dense2(hidden)


# Build the classifier and load the pretrained checkpoint into its encoder.
mode_1 = MyModel(MODEL, MODE, CONFIG, DROP, DIM, CATE)
mode_1.bert.loading(CKPT)
# Total number of optimizer steps over the whole run. The training dataset is
# batched without dropping the last partial batch, so one epoch has
# ceil(len(trai_1) / BATCH) steps; ceiling division avoids counting one extra
# step per epoch when the size is an exact multiple of BATCH.
step_1 = EPOCH*((len(trai_1)+BATCH-1)//BATCH)
# Labels are integer ids and the model outputs probabilities (softmax), which
# matches the default from_logits=False of this loss.
loss_0 = keras.losses.SparseCategoricalCrossentropy()
# NOTE(review): AdamWV2 presumably uses the step count for its learning-rate
# schedule - confirm in mymodels.
opti_1 = mm.AdamWV2(step_1, LRATE)

In [None]:
# Running metrics: *_1 are updated by step_training, *_2 by step_evaluating.
# Each object accumulates over every call until it is explicitly reset.
loss_1 = tf.keras.metrics.Mean(name='training_loss')
accu_1 = tf.keras.metrics.SparseCategoricalAccuracy(name='training_accuracy')
loss_2 = tf.keras.metrics.Mean(name='dev_loss')
accu_2 = tf.keras.metrics.SparseCategoricalAccuracy(name='dev_accuracy')


@tf.function
def step_training(text, segment, mask, y):
  """Run one optimisation step on a batch and update the training metrics.

  Uses the module-level model (mode_1), loss (loss_0), optimizer (opti_1) and
  metrics (loss_1, accu_1). Gradients are clipped to a global norm of 1.0
  before being applied.
  """
  with tf.GradientTape() as tape:
    probs = mode_1.propagating(text, segment, mask, True)
    batch_loss = loss_0(y, probs)

  # Collected after the forward pass, as in the original ordering, so that any
  # variables created during the first call are included.
  weights = mode_1.trainable_variables
  grads = tape.gradient(batch_loss, weights)
  clipped, _ = tf.clip_by_global_norm(grads, 1.0)
  opti_1.apply_gradients(zip(clipped, weights))

  loss_1(batch_loss)
  accu_1(y, probs)


@tf.function
def step_evaluating(text, segment, mask, y):
  """Score one batch in inference mode and update the dev metrics.

  Uses the module-level model (mode_1) and loss (loss_0); the results are
  accumulated into loss_2 and accu_2. No weights are changed.
  """
  probs = mode_1.propagating(text, segment, mask, False)
  batch_loss = loss_0(y, probs)
  loss_2(batch_loss)
  accu_2(y, probs)

In [None]:
temp_1 = 'Training loss is {}, accuracy is {}, and step cost is {}.'
temp_2 = 'Dev accuracy is {}, and epoch cost is {}.'
# Global step counter; it keeps running across epochs.
coun_1 = 0

for e_1 in range(EPOCH):
  print('Epoch {} running.'.format(e_1+1))
  time_0 = time.time()

  for x_1, x_2, x_3, y_1 in trai_2:
    time_1, coun_1 = time.time(), coun_1+1
    step_training(x_1, x_2, x_3, y_1)

    # Every 500 steps report the running training metrics of the current
    # epoch together with the wall-clock time of the latest step.
    if coun_1 % 500 == 0:
      o_1, o_2 = round(float(loss_1.result()), 4), round(float(accu_1.result()), 4)
      print(temp_1.format(o_1, o_2, round(time.time()-time_1, 4)))

  # Evaluate on the dev split once the epoch's training pass is finished.
  for x_1, x_2, x_3, y_1 in deve_2:
    step_evaluating(x_1, x_2, x_3, y_1)

  print(temp_2.format(round(float(accu_2.result()), 4), round(time.time()-time_0, 4)))
  print('**********')
  # Start the next epoch with clean metrics. The loss metrics are reset along
  # with the accuracies; otherwise the reported training loss would be a mean
  # over all previous epochs while the accuracy covers only the current one.
  loss_1.reset_states()
  accu_1.reset_states()
  loss_2.reset_states()
  accu_2.reset_states()

Epoch 1 running.
Training loss is 1.5628, accuracy is 0.578, and step cost is 0.35.
Training loss is 1.1704, accuracy is 0.6805, and step cost is 0.3596.
Training loss is 1.0157, accuracy is 0.7196, and step cost is 0.3738.
Training loss is 0.929, accuracy is 0.741, and step cost is 0.3566.
Dev accuracy is 0.8234, and epoch cost is 936.8971.
**********
Epoch 2 running.
Training loss is 0.8692, accuracy is 0.8494, and step cost is 0.3585.
Training loss is 0.8073, accuracy is 0.8546, and step cost is 0.3557.
Training loss is 0.7605, accuracy is 0.8558, and step cost is 0.3532.
Training loss is 0.7252, accuracy is 0.8567, and step cost is 0.3568.
Training loss is 0.698, accuracy is 0.8571, and step cost is 0.3542.
Dev accuracy is 0.8378, and epoch cost is 922.7776.
**********
Epoch 3 running.
Training loss is 0.6703, accuracy is 0.9068, and step cost is 0.3572.
Training loss is 0.6374, accuracy is 0.9072, and step cost is 0.3549.
Training loss is 0.6096, accuracy is 0.9089, and step cost 

In [None]:
# c_1 counts the test examples seen so far, c_2 the correctly classified ones.
c_1, c_2 = 0, 0

for x_1, x_2, x_3, y_1 in test_2:
  pred_1 = mode_1.propagating(x_1, x_2, x_3, False)
  # The predicted class is the index of the highest probability in each row.
  comp_1 = np.sum(np.array(y_1) == np.argmax(pred_1, 1))
  c_1 += len(pred_1)
  c_2 += comp_1

print('Test accuracy is '+str(round(float(c_2/c_1), 4))+'.')

Test accuracy is 0.8428.
