A text similarity classification example for ALBERT using Google Colab.  
The data is from LCQMC.


In [0]:
try:
  %tensorflow_version 2.x
except Exception:
  pass

import time
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from bert_by_tf2 import BERT, AdamW, Tokenizer

In [0]:
trai_1 = pd.read_csv('datasets/b/train.csv')
deve_1 = pd.read_csv('datasets/b/dev.csv')
test_1 = pd.read_csv('datasets/b/test.csv')
print(test_1.head())

                sentence1      sentence2  label
0               谁有狂三这张高清的       这张高清图，谁有      0
1              英雄联盟什么英雄最好    英雄联盟最好英雄是什么      1
2             这是什么意思，被蹭网吗   我也是醉了，这是什么意思      0
3            现在有什么动画片好看呢？  现在有什么好看的动画片吗？      1
4  请问晶达电子厂现在的工资待遇怎么样要求有哪些  三星电子厂工资待遇怎么样啊      0


In [0]:
ALBERT = True
MAXLEN = 80
CATE = 2
DROP = 0.5
DIM = 128
LRATE = 1e-4
BATCH = 64
EPOCH = 10

VOCAB = 'models/albert_tiny_ch/vocab.txt'
CONFIG = 'models/albert_tiny_ch/albert_config_tiny_g.json'
CKPT = 'models/albert_tiny_ch/albert_model.ckpt'

In [0]:
def data_processing(data, tokenizer, maxlen, batch):
    text1, type1, mask1, labe1 = [], [], [], []

    for i in range(len(data)):
        if len(data['sentence1'][i])+len(data['sentence2'][i]) > maxlen-2:
            continue

        text2, type2, mask2 = tokenizer.encoding(data['sentence1'][i], data['sentence2'][i], maxlen)
        labe2 = data['label'][i]
        text1.append(text2)
        type1.append(type2)
        mask1.append(mask2)
        labe1.append(labe2)

    text1, type1, mask1, labe1 = np.array(text1), np.array(type1), np.array(mask1), np.array(labe1)
    return tf.data.Dataset.from_tensor_slices((text1, type1, mask1, labe1)).shuffle(len(text1)).batch(batch)

toke_1 = Tokenizer()
toke_1.loading(VOCAB)
trai_2 = data_processing(trai_1, toke_1, MAXLEN, BATCH)
deve_2 = data_processing(deve_1, toke_1, MAXLEN, BATCH)
test_2 = data_processing(test_1, toke_1, MAXLEN, BATCH)

In [0]:
class MyModel(keras.Model):
  def __init__(self, albert, config, drop, dim, category):
    super(MyModel, self).__init__()
    self.bert = BERT(config, albert)
    self.drop = keras.layers.Dropout(drop)
    self.dense1 = keras.layers.Dense(dim, activation='relu')
    self.dense2 = keras.layers.Dense(category, activation='softmax')

  def propagating(self, text, segment, mask, training):
    x1 = self.bert.propagating(text, segment, mask, True, training)
    x1 = self.drop(x1, training=training)
    return self.dense2(self.dense1(x1))

In [0]:
step_1 = EPOCH*(int(len(trai_1)/BATCH)+1)
loss_1 = keras.losses.SparseCategoricalCrossentropy()
opti_1 = AdamW(step_1, LRATE)
mode_1 = MyModel(ALBERT, CONFIG, DROP, DIM, CATE)
mode_1.bert.loading(CKPT)

In [0]:
l_1 = tf.keras.metrics.Mean(name='training_loss')
a_1 = tf.keras.metrics.SparseCategoricalAccuracy(name='training_accuracy')
l_2 = tf.keras.metrics.Mean(name='test_loss')
a_2 = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
coun_1 = 0

@tf.function
def step_training(text, segment, mask, y):
  with tf.GradientTape() as tape:
    pred1 = mode_1.propagating(text, segment, mask, True)
    loss1 = loss_1(y, pred1)

  grad_1 = tape.gradient(loss1, mode_1.trainable_variables)
  opti_1.apply_gradients(zip(grad_1, mode_1.trainable_variables))
  l_1(loss1)
  a_1(y, pred1)

@tf.function
def step_evaluating(text, segment, mask, y):
  pred1 = mode_1.propagating(text, segment, mask, False)
  loss1 = loss_1(y, pred1)
  l_2(loss1)
  a_2(y, pred1)

In [0]:
temp_1 = 'Epoch {} running, loss is {}, training accuracy is {}, and step cost is {}.'
temp_2 = 'Epoch {} completed, training accuracy is {}, and test accuracy is {}.'

for e_1 in range(EPOCH):
  for x_1, x_2, x_3, y_1 in trai_2:
    time_1, coun_1 = time.time(), coun_1+1
    step_training(x_1, x_2, x_3, y_1)

    if coun_1 % 1000 == 0:
        o_1, o_2 = round(float(l_1.result()), 4), round(float(a_1.result()), 4)
        print(temp_1.format(e_1+1, o_1, o_2, round(time.time()-time_1, 4)))

  for x_1, x_2, x_3, y_1 in deve_2:
    step_evaluating(x_1, x_2, x_3, y_1)

  print(temp_2.format(e_1+1, round(float(a_1.result()), 4), round(float(a_2.result()), 4)))
  print('**********')

Epoch 1 running, loss is 0.356, training accuracy is 0.843, and step cost is 0.114.
Epoch 1 running, loss is 0.31, training accuracy is 0.8656, and step cost is 0.1179.
Epoch 1 running, loss is 0.2907, training accuracy is 0.8748, and step cost is 0.1159.
Epoch 1 completed, training accuracy is 0.8794, and test accuracy is 0.8094.
**********
Epoch 2 running, loss is 0.2765, training accuracy is 0.8815, and step cost is 0.1194.
Epoch 2 running, loss is 0.2661, training accuracy is 0.8865, and step cost is 0.1163.
Epoch 2 running, loss is 0.2589, training accuracy is 0.8899, and step cost is 0.1197.
Epoch 2 running, loss is 0.2537, training accuracy is 0.8924, and step cost is 0.1158.
Epoch 2 completed, training accuracy is 0.8935, and test accuracy is 0.8099.
**********
Epoch 3 running, loss is 0.247, training accuracy is 0.8956, and step cost is 0.1194.
Epoch 3 running, loss is 0.2403, training accuracy is 0.8988, and step cost is 0.117.
Epoch 3 running, loss is 0.2355, training accura

In [0]:
c_1, c_2 = 0, 0

for x_1, x_2, x_3, y_1 in deve_2:
  pred_1 = mode_1.propagating(x_1, x_2, x_3, False)
  comp_1 = sum(np.array(y_1)==np.argmax(pred_1, 1))
  c_1, c_2 = c_1+len(pred_1), c_2+comp_1

print('Validation accuracy is '+str(c_2/c_1)+'.')

Validation accuracy is 0.8437855032947057.


In [0]:
c_1, c_2 = 0, 0

for x_1, x_2, x_3, y_1 in test_2:
  pred_1 = mode_1.propagating(x_1, x_2, x_3, False)
  comp_1 = sum(np.array(y_1)==np.argmax(pred_1, 1))
  c_1, c_2 = c_1+len(pred_1), c_2+comp_1

print('Test accuracy is '+str(c_2/c_1)+'.')

Test accuracy is 0.84408.
