A text classification example for BERT using Google Colab.  
The data is from https://github.com/FudanNLP/nlpcc2017_news_headline_categorization.


In [0]:
try:
  %tensorflow_version 2.x
except Exception:
  pass

import time
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from sklearn import preprocessing
from bert_by_tf2 import BERT, AdamW, Tokenizer

In [0]:
trai_1 = pd.read_table('datasets/a/train.txt', header=None, names=['label', 'text'])
deve_1 = pd.read_table('datasets/a/dev.txt', header=None, names=['label', 'text'])
test_1 = pd.read_table('datasets/a/test.txt', header=None, names=['label', 'text'])
print(test_1.head())

           label                                            text
0           baby                 生完 小孩 ， 公公 伺候 我 坐月子 ， 很 羞涩 很 感动
1        fashion    唐艺昕 与 陈伟霆 为 初秋 的 情侣 做出 了 典范 ， 看 他们 如何 穿 情侣 衣
2  entertainment  同学聚会 美女 被 嘲笑 是 剩女 ， 当超帅 老公 带 着 儿子 出场 ， 全场 沸腾 了
3        finance                             中国 供给 侧 至少 存在 六大 问题
4          world                                2.5 万英镑 可住 戴妃 闺房


In [0]:
labe_1 = preprocessing.LabelEncoder()
trai_1['label'] = labe_1.fit_transform(trai_1['label'])
deve_1['label'] = labe_1.transform(deve_1['label'])
test_1['label'] = labe_1.transform(test_1['label'])

for i in range(18):
    print(i, labe_1.inverse_transform([i])[0])

0 baby
1 car
2 discovery
3 entertainment
4 essay
5 fashion
6 finance
7 food
8 game
9 history
10 military
11 regimen
12 society
13 sports
14 story
15 tech
16 travel
17 world


In [0]:
ALBERT = False
MAXLEN = 40
CATE = 18
DROP = 0.5
DIM = 128
LRATE = 5e-5
BATCH = 64
EPOCH = 3

VOCAB = 'models/bert_base_ch/vocab.txt'
CONFIG = 'models/bert_base_ch/bert_config.json'
CKPT = 'models/bert_base_ch/bert_model.ckpt'

In [0]:
def data_processing(data, tokenizer, maxlen, batch):
    text1, type1, mask1, labe1 = [], [], [], []

    for i in range(len(data)):
        text2, type2, mask2 = tokenizer.encoding(data['text'][i], None, maxlen)
        labe2 = data['label'][i]
        text1.append(text2)
        type1.append(type2)
        mask1.append(mask2)
        labe1.append(labe2)

    text1, type1, mask1, labe1 = np.array(text1), np.array(type1), np.array(mask1), np.array(labe1)
    return tf.data.Dataset.from_tensor_slices((text1, type1, mask1, labe1)).shuffle(len(text1)).batch(batch)

toke_1 = Tokenizer()
toke_1.loading(VOCAB)
trai_2 = data_processing(trai_1, toke_1, MAXLEN, BATCH)
deve_2 = data_processing(deve_1, toke_1, MAXLEN, BATCH)
test_2 = data_processing(test_1, toke_1, MAXLEN, BATCH)

In [0]:
class MyModel(keras.Model):
  def __init__(self, albert, config, drop, dim, category):
    super(MyModel, self).__init__()
    self.bert = BERT(config, albert)
    self.drop = keras.layers.Dropout(drop)
    self.dense1 = keras.layers.Dense(dim, activation='relu')
    self.dense2 = keras.layers.Dense(category, activation='softmax')

  def propagating(self, text, segment, mask, training):
    x1 = self.bert.propagating(text, segment, mask, True, training)
    x1 = self.drop(x1, training=training)
    return self.dense2(self.dense1(x1))

In [0]:
step_1 = EPOCH*(int(len(trai_1)/BATCH)+1)
loss_1 = keras.losses.SparseCategoricalCrossentropy()
opti_1 = AdamW(step_1, LRATE)
mode_1 = MyModel(ALBERT, CONFIG, DROP, DIM, CATE)
mode_1.bert.loading(CKPT)

In [0]:
l_1 = tf.keras.metrics.Mean(name='training_loss')
a_1 = tf.keras.metrics.SparseCategoricalAccuracy(name='training_accuracy')
l_2 = tf.keras.metrics.Mean(name='test_loss')
a_2 = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
coun_1 = 0

@tf.function
def step_training(text, segment, mask, y):
  with tf.GradientTape() as tape:
    pred1 = mode_1.propagating(text, segment, mask, True)
    loss1 = loss_1(y, pred1)

  grad_1 = tape.gradient(loss1, mode_1.trainable_variables)
  opti_1.apply_gradients(zip(grad_1, mode_1.trainable_variables))
  l_1(loss1)
  a_1(y, pred1)

@tf.function
def step_evaluating(text, segment, mask, y):
  pred1 = mode_1.propagating(text, segment, mask, False)
  loss1 = loss_1(y, pred1)
  l_2(loss1)
  a_2(y, pred1)

In [0]:
temp_1 = 'Epoch {} running, loss is {}, training accuracy is {}, and step cost is {}.'
temp_2 = 'Epoch {} completed, training accuracy is {}, and test accuracy is {}.'

for e_1 in range(EPOCH):
  for x_1, x_2, x_3, y_1 in trai_2:
    time_1, coun_1 = time.time(), coun_1+1
    step_training(x_1, x_2, x_3, y_1)

    if coun_1 % 500 == 0:
        o_1, o_2 = round(float(l_1.result()), 4), round(float(a_1.result()), 4)
        print(temp_1.format(e_1+1, o_1, o_2, round(time.time()-time_1, 4)))

  for x_1, x_2, x_3, y_1 in deve_2:
    step_evaluating(x_1, x_2, x_3, y_1)

  print(temp_2.format(e_1+1, round(float(a_1.result()), 4), round(float(a_2.result()), 4)))
  print('**********')

Epoch 1 running, loss is 1.1703, training accuracy is 0.6733, and step cost is 0.4859.
Epoch 1 running, loss is 0.9623, training accuracy is 0.7297, and step cost is 0.4908.
Epoch 1 running, loss is 0.872, training accuracy is 0.7527, and step cost is 0.5247.
Epoch 1 running, loss is 0.8187, training accuracy is 0.7657, and step cost is 0.488.
Epoch 1 completed, training accuracy is 0.7741, and test accuracy is 0.8208.
**********
Epoch 2 running, loss is 0.7765, training accuracy is 0.7764, and step cost is 0.5035.
Epoch 2 running, loss is 0.7239, training accuracy is 0.7908, and step cost is 0.4829.
Epoch 2 running, loss is 0.686, training accuracy is 0.8009, and step cost is 0.4836.
Epoch 2 running, loss is 0.6572, training accuracy is 0.8087, and step cost is 0.4741.
Epoch 2 running, loss is 0.6338, training accuracy is 0.8152, and step cost is 0.4752.
Epoch 2 completed, training accuracy is 0.8191, and test accuracy is 0.8274.
**********
Epoch 3 running, loss is 0.6111, training ac

In [0]:
c_1, c_2 = 0, 0

for x_1, x_2, x_3, y_1 in test_2:
  pred_1 = mode_1.propagating(x_1, x_2, x_3, False)
  comp_1 = sum(np.array(y_1)==np.argmax(pred_1, 1))
  c_1, c_2 = c_1+len(pred_1), c_2+comp_1

print('Test accuracy is '+str(c_2/c_1)+'.')

Test accuracy is 0.832239951075777.
