A classification example for BERT using Google Colab.  
The data is from https://github.com/FudanNLP/nlpcc2017_news_headline_categorization.


In [0]:
# Mount Google Drive into the Colab VM through google-drive-ocamlfuse.
# Step 1: install the tool from its PPA.
# NOTE(review): `2>&1 > /dev/null` sends stderr to the cell output and discards
# only stdout; use `> /dev/null 2>&1` if the intent was to silence both.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Step 2: authenticate the Colab user and read the application-default credentials.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# Step 3: run the tool headless with the client id/secret and keep only the
# output line containing "URL" (presumably the authorisation link to open).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it, then pipe it to the tool.
# NOTE(review): `{vcode}` is interpolated into a shell command line unquoted.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# Step 4: create the mount point and mount the drive at ./drive.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [0]:
import os
import warnings
# Switch into the project folder under the `drive` mount point; every relative
# path used later in the notebook is resolved against this directory.
# NOTE: the path is relative, so running this cell a second time in the same
# session resolves it against the already-changed working directory.
os.chdir('drive/python/project01/bert')
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [0]:
# Ask the runtime for TensorFlow 2.x through the `%tensorflow_version` line magic.
# Any exception is deliberately ignored (presumably so the notebook still runs
# where the magic is unavailable — TODO confirm).
try:
  %tensorflow_version 2.x
except Exception:
  pass

import time
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from sklearn import preprocessing
from bert_by_tf2 import BERT, DecayingADAM, Tokenizer

In [4]:
def _read_split(path):
    """Read one headerless split file into a DataFrame with 'label' and 'text' columns."""
    return pd.read_table(path, header=None, names=['label', 'text'])


# Training, development and test splits share the same two-column layout.
trai_1 = _read_split('datasets/a/train.txt')
deve_1 = _read_split('datasets/a/dev.txt')
test_1 = _read_split('datasets/a/test.txt')
print(test_1)

               label                                            text
0               baby                 生完 小孩 ， 公公 伺候 我 坐月子 ， 很 羞涩 很 感动
1            fashion    唐艺昕 与 陈伟霆 为 初秋 的 情侣 做出 了 典范 ， 看 他们 如何 穿 情侣 衣
2      entertainment  同学聚会 美女 被 嘲笑 是 剩女 ， 当超帅 老公 带 着 儿子 出场 ， 全场 沸腾 了
3            finance                             中国 供给 侧 至少 存在 六大 问题
4              world                                2.5 万英镑 可住 戴妃 闺房
...              ...                                             ...
35969         travel               白俄罗斯 小镇 美女 愁 嫁 ， 年轻 姑娘 都 想 嫁 到 中国
35970          world                 香港市民 游行 抗议 安倍 企图 修宪吁 警惕 军国主义 复辟
35971        finance                                 网贷 诈骗 平台 的 狐狸尾巴
35972         sports                 巅峰 雷阿伦 单挑 如今 的 库里 ， 谁 的 赢面 更大 ？
35973        society               “ 学生家长 ” 到 培训班 咨询 顺走 老师 现金 、 手机 ？

[35974 rows x 2 columns]


In [5]:
# Map the string labels to integer ids. The encoder is fitted on the training
# split only and then applied unchanged to the development and test splits.
labe_1 = preprocessing.LabelEncoder()
trai_1['label'] = labe_1.fit_transform(trai_1['label'])
deve_1['label'] = labe_1.transform(deve_1['label'])
test_1['label'] = labe_1.transform(test_1['label'])

# Print the id -> label table. `classes_[i]` is the label encoded as `i`, so
# iterating it lists every class without hard-coding how many there are.
for i, clas_1 in enumerate(labe_1.classes_):
    print(i, clas_1)

0 baby
1 car
2 discovery
3 entertainment
4 essay
5 fashion
6 finance
7 food
8 game
9 history
10 military
11 regimen
12 society
13 sports
14 story
15 tech
16 travel
17 world


In [0]:
# Hyper-parameters used by the cells below.
MAXLEN = 40    # third argument passed to the tokenizer's `encoding` (presumably the sequence length)
CATE = 18      # number of units in the final softmax layer, one per label
DROP = 0.5     # rate of the Dropout layer applied to the BERT output
DIM = 128      # number of units in the hidden ReLU Dense layer
LRATE = 5e-5   # second argument of DecayingADAM (presumably the initial learning rate)
BATCH = 64     # batch size of every tf.data.Dataset built by data_processing
EPOCH = 3      # number of passes over the training set

In [0]:
def data_processing(data, tokenizer, maxlen, batch):
    """Encode a labelled text table and wrap it in a shuffled, batched dataset.

    Args:
        data: Table with a 'text' and a 'label' column (e.g. a pandas DataFrame).
        tokenizer: Object whose `encoding(text, None, maxlen)` returns three
            sequences; they are stored as the text, type and mask inputs.
        maxlen: Forwarded unchanged to `tokenizer.encoding`.
        batch: Batch size of the returned dataset.

    Returns:
        A `tf.data.Dataset` yielding `(text, type, mask, label)` batches, shuffled
        with a buffer as large as the number of rows.
    """
    text1, type1, mask1 = [], [], []

    # Walk the column values positionally. Looking rows up with `data['text'][i]`
    # is a label lookup and fails (or picks other rows) when the index is not 0..n-1.
    for sentence in data['text']:
        text2, type2, mask2 = tokenizer.encoding(sentence, None, maxlen)
        text1.append(text2)
        type1.append(type2)
        mask1.append(mask2)

    text1, type1, mask1 = np.array(text1), np.array(type1), np.array(mask1)
    labe1 = np.array(list(data['label']))
    return tf.data.Dataset.from_tensor_slices((text1, type1, mask1, labe1)).shuffle(len(text1)).batch(batch)

# Load the vocabulary and encode each split into a batched dataset.
toke_1 = Tokenizer()
toke_1.loading('bert/models/bert_base_ch/vocab.txt')
trai_2 = data_processing(trai_1, toke_1, MAXLEN, BATCH)
deve_2 = data_processing(deve_1, toke_1, MAXLEN, BATCH)
test_2 = data_processing(test_1, toke_1, MAXLEN, BATCH)
# Total number of training steps: EPOCH passes of ceil(rows / BATCH) batches.
# Ceiling division avoids counting one extra step per epoch when the row count
# is an exact multiple of BATCH.
step_1 = EPOCH*((len(trai_1)+BATCH-1)//BATCH)

In [0]:
class MyModel(keras.Model):
  """Classifier that stacks dropout and two dense layers on top of `BERT`."""

  def __init__(self, config, drop, dim, category):
    """Create the sub-layers.

    Args:
      config: Passed to the `BERT` constructor.
      drop: Rate of the dropout layer.
      dim: Number of units in the hidden ReLU layer.
      category: Number of units in the softmax output layer.
    """
    super().__init__()
    self.bert = BERT(config)
    self.drop = keras.layers.Dropout(drop)
    self.dense1 = keras.layers.Dense(dim, activation='relu')
    self.dense2 = keras.layers.Dense(category, activation='softmax')

  def propagating(self, text, segment, mask, training):
    """Run the forward pass and return the softmax output of the last layer.

    `training` is forwarded to both the BERT call and the dropout layer.
    """
    # The fourth positional argument of `BERT.propagating` is fixed to True here;
    # its meaning is defined in bert_by_tf2 (TODO confirm).
    features = self.bert.propagating(text, segment, mask, True, training)
    features = self.drop(features, training=training)
    hidden = self.dense1(features)
    return self.dense2(hidden)

In [0]:
# Loss over integer class ids and the model's softmax output.
loss_1 = keras.losses.SparseCategoricalCrossentropy()
# Optimiser from bert_by_tf2, configured with the step count and LRATE.
opti_1 = DecayingADAM(step_1, LRATE)
# Build the classifier from the BERT config file, then load the checkpoint
# into its BERT sub-module.
mode_1 = MyModel('bert/models/bert_base_ch/bert_config.json', DROP, DIM, CATE)
mode_1.bert.loading('bert/models/bert_base_ch/bert_model.ckpt')

In [0]:
# Running mean of the loss and running accuracy, updated by step_training.
l_1 = tf.keras.metrics.Mean(name='training_loss')
a_1 = tf.keras.metrics.SparseCategoricalAccuracy(name='training_accuracy')
# Same pair, updated by step_evaluating. They are named 'test_*', but the
# training loop feeds them from the development split.
l_2 = tf.keras.metrics.Mean(name='test_loss')
a_2 = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
# Number of training steps executed so far; incremented in the training loop.
coun_1 = 0

@tf.function
def step_training(text, segment, mask, y):
  """Run one optimisation step on a batch and update the training metrics.

  Uses the module-level `mode_1`, `loss_1`, `opti_1`, `l_1` and `a_1`.
  """
  with tf.GradientTape() as tape:
    probs = mode_1.propagating(text, segment, mask, True)
    batch_loss = loss_1(y, probs)

  # Fetch the variable list after the forward pass has run.
  weights = mode_1.trainable_variables
  gradients = tape.gradient(batch_loss, weights)
  opti_1.apply_gradients(zip(gradients, weights))
  l_1(batch_loss)
  a_1(y, probs)

@tf.function
def step_evaluating(text, segment, mask, y):
  """Score one batch in inference mode and update the evaluation metrics.

  Uses the module-level `mode_1`, `loss_1`, `l_2` and `a_2`.
  """
  probs = mode_1.propagating(text, segment, mask, False)
  batch_loss = loss_1(y, probs)
  l_2(batch_loss)
  a_2(y, probs)

In [11]:
temp_1 = 'Epoch {} running, training loss is {}, and training accuracy is {}, and step cost is {}.'
temp_2 = 'Epoch {} completed, training loss is {}, training accuracy is {}, test loss is {}, and test accuracy is {}.'

for e_1 in range(EPOCH):
  # Keras metrics accumulate until reset. Clear them at the start of every epoch
  # so the figures printed for epoch N describe epoch N only, not a running
  # average over all earlier epochs.
  for metr_1 in (l_1, a_1, l_2, a_2):
    # `reset_state` is the current Keras name; older TF 2.x releases only
    # provide `reset_states`.
    (getattr(metr_1, 'reset_state', None) or metr_1.reset_states)()

  # Training pass; progress is printed every 100 global steps.
  for x_1, x_2, x_3, y_1 in trai_2:
    time_1 = time.time()
    step_training(x_1, x_2, x_3, y_1)
    coun_1 = coun_1+1

    if coun_1 % 100 == 0:
      # "step cost" is the wall-clock time of the step that just ran.
      print(temp_1.format(e_1+1, l_1.result(), a_1.result(), time.time()-time_1))

  # Evaluation pass over the development split (reported as "test" in temp_2).
  for x_1, x_2, x_3, y_1 in deve_2:
    step_evaluating(x_1, x_2, x_3, y_1)

  print(temp_2.format(e_1+1, l_1.result(), a_1.result(), l_2.result(), a_2.result()))
  print('**********')

Epoch 1 running, training loss is 2.32075572013855, and training accuracy is 0.34312498569488525, and step cost is 0.6965832710266113.
Epoch 1 running, training loss is 1.6398135423660278, and training accuracy is 0.5415624976158142, and step cost is 0.7103571891784668.
Epoch 1 running, training loss is 1.3880748748779297, and training accuracy is 0.6152083277702332, and step cost is 0.7120764255523682.
Epoch 1 running, training loss is 1.249719500541687, and training accuracy is 0.6538671851158142, and step cost is 0.674846887588501.
Epoch 1 running, training loss is 1.165598750114441, and training accuracy is 0.6775312423706055, and step cost is 0.6955010890960693.
Epoch 1 running, training loss is 1.1035981178283691, and training accuracy is 0.6946094036102295, and step cost is 0.6796801090240479.
Epoch 1 running, training loss is 1.0543708801269531, and training accuracy is 0.7075892686843872, and step cost is 0.709733247756958.
Epoch 1 running, training loss is 1.021166205406189, 

In [12]:
# c_1 counts the examples seen, c_2 the correctly classified ones.
c_1, c_2 = 0, 0

for x_1, x_2, x_3, y_1 in test_2:
  pred_1 = mode_1.propagating(x_1, x_2, x_3, False)
  # Count the matches with a vectorised NumPy reduction instead of iterating
  # the boolean array element by element with the builtin sum().
  comp_1 = int(np.sum(np.array(y_1) == np.argmax(pred_1, 1)))
  c_1, c_2 = c_1+len(pred_1), c_2+comp_1

print('Test accuracy is '+str(c_2/c_1)+'.')

Test accuracy is 0.8345749708122533.
