In [1]:
# Check which GPU is available
!nvidia-smi
!pip install bert4keras

Sun May 30 13:29:37 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
# Every data / checkpoint path used in the later cells is relative, so the
# working directory is switched first (presumably the Colab Google Drive mount
# point — confirm for your environment).
os.chdir("/content/drive/MyDrive")
# Set before the bert4keras imports that follow; presumably this makes
# bert4keras run on tf.keras — TODO confirm against the bert4keras docs.
os.environ['TF_KERAS'] = '1'

import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time


def cal_scores(model, tokenizer, test_file_path, maxlen=None):
    """Score a trained classifier on a tab-separated test file.

    The file is read with pandas as two header-less, tab-separated columns:
    the label first and the text second.  Every text is encoded with
    ``tokenizer.encode``, all samples are padded into a single array and
    predicted with one ``model.predict`` call; the predicted class is the
    argmax of each output row.

    Args:
        model: object whose ``predict([token_ids, segment_ids])`` returns one
            row of class scores per sample.
        tokenizer: object whose ``encode(text, maxlen=...)`` returns
            ``(token_ids, segment_ids)``.
        test_file_path: path of the test file.
        maxlen: forwarded unchanged to ``tokenizer.encode``.

    Returns:
        The tuple ``(acc, prec_micro, recall_micro, f1_micro, prec_macro,
        recall_macro, f1_macro, total_time)`` where ``total_time`` is the
        wall-clock time in seconds spent inside ``model.predict``.
    """
    # Parse the file once (it used to be read twice, the first result being
    # discarded immediately).
    test_data_df = pd.read_csv(test_file_path, header=None, sep="\t", names=["tag", "content"])
    y_true = test_data_df["tag"].tolist()

    all_token_ids, all_segment_ids = [], []
    # Only the text column is needed, so iterate over it directly instead of
    # materialising a Series per row with iterrows().
    for text in test_data_df["content"]:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        all_token_ids.append(token_ids)
        all_segment_ids.append(segment_ids)

    all_token_ids = sequence_padding(all_token_ids)
    all_segment_ids = sequence_padding(all_segment_ids)

    # Only the prediction itself is timed; encoding and padding are excluded.
    start = time.time()
    y_pred = model.predict([all_token_ids, all_segment_ids]).argmax(axis=1)
    total_time = time.time() - start

    acc = accuracy_score(y_true, y_pred)
    # micro scores
    prec_micro = precision_score(y_true, y_pred, average='micro')
    recall_micro = recall_score(y_true, y_pred, average='micro')
    f1_micro = f1_score(y_true, y_pred, average='micro')
    # macro scores
    prec_macro = precision_score(y_true, y_pred, average='macro')
    recall_macro = recall_score(y_true, y_pred, average='macro')
    f1_macro = f1_score(y_true, y_pred, average='macro')

    print("Accuracy = %f" % acc)
    # With at most two distinct true labels the micro-averaged scores are
    # printed, otherwise the macro-averaged ones.  Both sets are returned.
    if len(set(y_true)) <= 2:
        print("Micro Precision = %f" % prec_micro)
        print("Micro Recall = %f" % recall_micro)
        print("Micro F1 = %f" % f1_micro)
    else:
        print("Macro Precision = %f" % prec_macro)
        print("Macro Recall = %f" % recall_macro)
        print("Macro F1 = %f" % f1_macro)
    print("Total prediction time is %f" % total_time)

    return acc, prec_micro, recall_micro, f1_micro, prec_macro, recall_macro, f1_macro, total_time


def load_data(filename):
    """Load a tab-separated ``label<TAB>text`` file.

    Blank (whitespace-only) lines are skipped.  Only the first tab separates
    the label from the text, so a text that itself contains tabs is kept
    intact.

    Args:
        filename: path of a UTF-8 encoded file with one sample per line.

    Returns:
        A list of ``(text, label_id)`` tuples, ``label_id`` being an ``int``.

    Raises:
        ValueError: if a non-blank line contains no tab, or its label is not
            an integer.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Stray empty lines are not samples.
                continue
            label, sep, text = line.partition('\t')
            if not sep:
                raise ValueError(
                    "%s, line %d: expected 'label<TAB>text', got %r" % (filename, line_no, line)
                )
            D.append((text, int(label)))
    return D


def build_bert(model_type, num_classes, epochs, maxlen, learning_rate, batch_sizes, pre_config, data_path):
    """Fine-tune a pre-trained transformer as a text classifier.

    A softmax layer is put on top of the first-position output of the
    pre-trained model.  After every epoch the model is evaluated on the
    validation set; the weights of the best epoch are written to
    ``data_path[3]`` and loaded back before the function returns.

    Args:
        model_type: passed as ``model=`` to ``build_transformer_model``
            (this notebook uses ``"albert"`` and ``"bert"``).
        num_classes: number of units of the softmax output layer.
        epochs: number of training epochs.
        maxlen: passed to ``tokenizer.encode`` for every sample of a batch.
        learning_rate: learning rate handed to the optimizer.
        batch_sizes: ``[train, valid, test]`` batch sizes.
        pre_config: ``[config_path, checkpoint_path, vocab_path]`` of the
            pre-trained model.
        data_path: ``[train_file, valid_file, test_file, weights_file]``.

    Returns:
        ``(model, tokenizer)`` with the best validation weights loaded.
    """
    ################################ Model settings ################################
    set_gelu('tanh')  # switch the gelu implementation

    train_data = load_data(data_path[0])
    valid_data = load_data(data_path[1])
    test_data = load_data(data_path[2])

    # Build the tokenizer
    tokenizer = Tokenizer(pre_config[2], do_lower_case=True)

    class data_generator(DataGenerator):
        """Yields padded ``([token_ids, segment_ids], labels)`` batches."""
        def __iter__(self, random=False):
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
            for is_end, (text, label) in self.sample(random):
                token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append([label])
                # Emit a batch when it is full or when the data is exhausted.
                if len(batch_token_ids) == self.batch_size or is_end:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_labels = sequence_padding(batch_labels)
                    yield [batch_token_ids, batch_segment_ids], batch_labels
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []

    # Wrap the datasets in generators
    train_generator = data_generator(train_data, batch_sizes[0])
    valid_generator = data_generator(valid_data, batch_sizes[1])
    test_generator = data_generator(test_data, batch_sizes[2])

    ################################ Load the pre-trained model ################################
    bert = build_transformer_model(
        config_path=pre_config[0],
        checkpoint_path=pre_config[1],
        model=model_type,
        return_keras_model=False,
    )

    # Keep only position 0 of the sequence output as input of the classifier.
    output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
    output = Dense(
        units=num_classes,
        activation='softmax',
        kernel_initializer=bert.initializer
    )(output)

    model = keras.models.Model(bert.model.input, output)
    model.summary()  # print the model structure

    ################################ Train the model ################################

    # Derive an optimizer with a piecewise-linear learning rate.
    # The name argument is optional, but giving one is recommended so that
    # different derived optimizers can be told apart.
    AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')

    model.compile(
        loss='sparse_categorical_crossentropy',
        # optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        optimizer=AdamLR(learning_rate=learning_rate, lr_schedule={
            1000: 1,
            2000: 0.1
        }),
        metrics=['accuracy'],
    )

    def evaluate(data):
        """Return the accuracy of ``model`` over all batches of ``data``."""
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            y_true = y_true[:, 0]
            total += len(y_true)
            right += (y_true == y_pred).sum()
        return right / total

    class Evaluator(keras.callbacks.Callback):
        """Evaluates after every epoch and saves the best weights."""
        def __init__(self):
            # Let the Keras base class initialise its own attributes.
            super().__init__()
            # Start below every reachable accuracy so that the first epoch
            # always writes the weights file; otherwise a run whose validation
            # accuracy stays at 0 would never save and the final load_weights
            # would fail or pick up a stale file from an earlier run.
            self.best_val_acc = float('-inf')

        def on_epoch_end(self, epoch, logs=None):
            val_acc = evaluate(valid_generator)
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                model.save_weights(data_path[3])
            # The test accuracy is only reported; model selection uses the
            # validation accuracy alone.
            test_acc = evaluate(test_generator)
            print(u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' % (val_acc, self.best_val_acc, test_acc))
            # print(u'val_acc: %.5f, best_val_acc: %.5f\n' % (val_acc, self.best_val_acc))

    evaluator = Evaluator()

    # Train the model
    model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator]
    )

    # Restore the weights of the best validation epoch.
    model.load_weights(data_path[3])
    # print(u'final test acc: %05f\n' % (evaluate(test_generator)))
    return model, tokenizer


    # def fit(self, train_filepath, valid_filepath, temp_save_path, 
    #         maxlen=128, learning_rate=1e-4, epochs=5, batch_size=32):
    #     train_data = load_data(train_filepath)
    #     train_generator = data_generator(train_data, batch_size, self.tokenizer)

    #     callbacks = None
    #     if valid_filepath != "" and valid_filepath is not None \
    #             and temp_save_path != "" and temp_save_path is not None:
    #         valid_data = load_data(valid_filepath)
    #         valid_generator = data_generator(valid_data, batch_size, self.tokenizer)
    #         evaluator = Evaluator(self.model, valid_generator, temp_save_path)
    #         callbacks = [evaluator]
        
    #     AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')

    #     self.model.compile(
    #         loss='sparse_categorical_crossentropy',
    #         optimizer=AdamLR(learning_rate=learning_rate, lr_schedule={
    #             1000: 1,
    #             2000: 0.1
    #         }),
    #         metrics=['accuracy'],
    #     )

    #     self.model.fit(
    #         train_generator.forfit(),
    #         steps_per_epoch=len(train_generator),
    #         epochs=epochs,
    #         callbacks=callbacks
    #     )

    #     if callbacks is not None:
    #         self.model.load_weights(temp_save_path)

    # def __init__(self, model_type, model_para_paths, label_filepath, origin):
    #     # 属性赋值
    #     self.model_type = model_type
    #     self.model_para_paths = model_para_paths
    #     self.label_filepath = label_filepath

    #     # 加载编号-标签字典
    #     with open(label_filepath, "r") as fin:
    #         reader = csv.reader(fin)
    #         self.label_dict = {int(row[0]):row[1] for row in reader}
        
    #     # 创建分词器
    #     tokenizer = Tokenizer(model_para_paths[2], do_lower_case=True)

    #     if origin: # 表示构建一个还未经过微调的模型
    #         # 模型的上游
    #         bert = build_transformer_model(
    #             config_path=model_para_paths[0],
    #             checkpoint_path=model_para_paths[1],
    #             model= model_type,
    #             return_keras_model=False,
    #         )
    #         # 取[CLS]这个token的输出向量作为下游任务的输入
    #         output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
    #         # 模型的下游
    #         output = Dense(
    #             units=num_classes,
    #             activation='softmax',
    #             kernel_initializer=bert.initializer
    #         )(output)
    #         # 连接模型的输入与输出
    #         self.model = keras.models.Model(bert.model.input, output)
    #     else: # 表示模型已经过微调
    #         self.model = build_transformer_model(
    #             config_path=model_para_paths[0],
    #             checkpoint_path=model_para_paths[1],
    #             model= model_type,
    #             return_keras_model=False,
    #         )
        
    #     self.model.summary() # 显示模型结构


In [3]:
# Hyper-parameters shared by every experiment below; test_model() reads them
# as module-level globals.
num_classes = 6  # units of the softmax output layer built in build_bert
epochs = 10  # training epochs per round
maxlen = 256  # passed to tokenizer.encode(maxlen=...) when building batches
learning_rate = 1e-4
batch_sizes = [32, 2048, 8192]  # batch sizes of the [train, valid, test] generators
n = 3  # number of train/evaluate rounds averaged by test_model


def test_model(model_config, data_path, model_type):
    """Train and score a model ``n`` times and print the averaged metrics.

    Reads the module-level globals ``n``, ``num_classes``, ``epochs``,
    ``maxlen``, ``learning_rate`` and ``batch_sizes``.

    Args:
        model_config: ``[config_path, checkpoint_path, vocab_path]``, handed
            to ``build_bert`` as ``pre_config``.
        data_path: ``[train_file, valid_file, test_file, weights_file]``.
        model_type: model name handed to ``build_bert``.

    Returns:
        None.  Averages of accuracy, macro precision/recall/F1 and prediction
        time over the ``n`` rounds are printed.
    """
    total_acc = 0
    total_prec_macro = 0
    total_recall_macro = 0
    total_f1_macro = 0
    total_time = 0
    for i in range(n):
        print("####################################### ROUND %d #######################################" % (i + 1))
        model, tokenizer = build_bert(model_type, num_classes, epochs, maxlen, learning_rate, 
                                      batch_sizes, model_config, data_path)
        # NOTE(review): training and the per-epoch evaluation truncate every
        # text to `maxlen`, while this final scoring encodes the full text.
        # Confirm that this is intended; the truncating variant is kept below.
        # acc, _, _, _, prec_macro, recall_macro, f1_macro, cost_time = cal_scores(model, tokenizer, data_path[2], maxlen)
        acc, _, _, _, prec_macro, recall_macro, f1_macro, cost_time = cal_scores(model, tokenizer, data_path[2])
        print("")

        total_acc += acc
        total_prec_macro += prec_macro
        total_recall_macro += recall_macro
        total_f1_macro += f1_macro
        total_time += cost_time

        # A fresh model is built every round.  Drop the finished one and reset
        # the Keras global state so that memory does not pile up from round to
        # round (and from experiment to experiment).
        del model, tokenizer
        keras.backend.clear_session()

    print("####################################### Final Resutl #######################################")
    print("Average Accuracy = %f" % (total_acc / n))
    print("Average Macro Precision = %f" % (total_prec_macro / n))
    print("Average Macro Recall = %f" % (total_recall_macro / n))
    print("Average Macro F1 = %f" % (total_f1_macro / n))
    print("Average Total prediction time is %f" % (total_time / n))

In [None]:
import sys
from bert4keras.tokenizers import load_vocab

# Compare a tokenizer built from the simplified vocabulary with one built
# directly from the vocabulary file.
dict_path = 'albert_small_zh_google/vocab.txt'

test_str = "I LOVE YOU"
test_str2 = "我愛你"


def _report(tok, text):
    """Print tokens, token ids, vocab size and token-dict size (KiB) of `tok`."""
    print(tok.tokenize(text))
    token_ids, segment_ids = tok.encode(text)
    print(token_ids)
    print(tok._vocab_size)
    print(sys.getsizeof(tok._token_dict) / 1024)
    return token_ids, segment_ids


# Tokenizer over the simplified vocabulary returned by load_vocab.
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
print(tokenizer._vocab_size)
print(tokenizer._token_dict["[UNK]"])
tids, sids = _report(tokenizer, test_str2)

print("+++++++++++++++++++++++++++++++")

# Tokenizer over the vocabulary file as it is.
tokenizer2 = Tokenizer(dict_path, do_lower_case=True)
print(tokenizer2._vocab_size)
tids, sids = _report(tokenizer2, test_str2)

13584
1
['[CLS]', '我', '愛', '你', '[SEP]']
[2, 2667, 2593, 770, 3]
13584
576.109375
+++++++++++++++++++++++++++++++
21128
['[CLS]', '我', '愛', '你', '[SEP]']
[101, 2769, 2695, 872, 102]
21128
576.109375


# SMP2020 without emoji

### ALBERT-Small

In [None]:
# Dataset files of the no-emoji SMP2020 variant, in the order build_bert
# indexes them: [0] train, [1] validation, [2] test, [3] file the best
# weights are saved to and reloaded from.
data_path = ['SMP2020/no_emoji/train.csv',
             'SMP2020/no_emoji/valid.csv', 
             'SMP2020/no_emoji/test.csv',
             'SMP2020/no_emoji/saved_model/best_model.weights']
             
# Pre-trained model files: [0] config, [1] checkpoint, [2] vocabulary.
pre_config = ['albert_small_zh_google/albert_config_small_google.json', 
              'albert_small_zh_google/albert_model.ckpt',
              'albert_small_zh_google/vocab.txt']

# Log the settings, then run the n training/evaluation rounds.
print("maxlen = %d, learning-rate = %f, batch-size = %d" %(maxlen, learning_rate, batch_sizes[0]))
test_model(pre_config, data_path, "albert")

maxlen = 256, learning-rate = 0.000100, batch-size = 32
####################################### ROUND 1 #######################################
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 128)    2704384     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 12

KeyboardInterrupt: ignored

### ALBERT-Base

In [None]:
# Pre-trained model files: [0] config, [1] checkpoint, [2] vocabulary.
# NOTE: data_path is not set in this cell; whatever value the kernel currently
# holds is used, so run the matching data_path cell first.
pre_config = ['albert_base_zh_google/albert_config.json', 
              'albert_base_zh_google/model.ckpt-best',
              'albert_base_zh_google/vocab_chinese.txt']

# Log the settings, then run the n training/evaluation rounds.
print("maxlen = %d, learning-rate = %f, batch-size = %d" %(maxlen, learning_rate, batch_sizes[0]))
test_model(pre_config, data_path, "albert")

maxlen = 256, learning-rate = 0.000100, batch-size = 32
####################################### ROUND 1 #######################################
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 128)    2704384     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 12

KeyboardInterrupt: ignored

### BERT-Base

In [None]:
# Pre-trained model files: [0] config, [1] checkpoint, [2] vocabulary.
# NOTE: data_path is not set in this cell; whatever value the kernel currently
# holds is used, so run the matching data_path cell first.
pre_config = ['bert_base_zh_google/bert_config.json', 
              'bert_base_zh_google/bert_model.ckpt',
              'bert_base_zh_google/vocab.txt']

# Log the settings, then run the n training/evaluation rounds.
print("maxlen = %d, learning-rate = %f, batch-size = %d" % (maxlen, learning_rate, batch_sizes[0]))
test_model(pre_config, data_path, "bert")

maxlen = 256, learning-rate = 0.000100, batch-size = 32
####################################### ROUND 1 #######################################
Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 768)    16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 76

KeyboardInterrupt: ignored

# SMP2020 with emoji

### ALBERT-Small

In [4]:
# Dataset files of the with-emoji SMP2020 variant, in the order build_bert
# indexes them: [0] train, [1] validation, [2] test, [3] file the best
# weights are saved to and reloaded from.
data_path = ['SMP2020/with_emoji/train.csv',
             'SMP2020/with_emoji/valid.csv', 
             'SMP2020/with_emoji/test.csv',
             'SMP2020/with_emoji/saved_model/best_model.weights']

In [7]:
# Pre-trained model files: [0] config, [1] checkpoint, [2] vocabulary.
# NOTE: data_path is not set in this cell; whatever value the kernel currently
# holds is used, so run the matching data_path cell first.
pre_config = ['albert_small_zh_google/albert_config_small_google.json', 
              'albert_small_zh_google/albert_model.ckpt',
              'albert_small_zh_google/vocab.txt']

# Log the settings, then run the n training/evaluation rounds.
print("maxlen = %d, learning-rate = %f, batch-size = %d" %(maxlen, learning_rate, batch_sizes[0]))
test_model(pre_config, data_path, "albert")

maxlen = 256, learning-rate = 0.000100, batch-size = 32
####################################### ROUND 1 #######################################
Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 128)    2704384     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 12

KeyboardInterrupt: ignored

### ALBERT-Base

In [6]:
# Pre-trained model files: [0] config, [1] checkpoint, [2] vocabulary.
# NOTE: data_path is not set in this cell; whatever value the kernel currently
# holds is used, so run the matching data_path cell first.
pre_config = ['albert_base_zh_google/albert_config.json', 
              'albert_base_zh_google/model.ckpt-best',
              'albert_base_zh_google/vocab_chinese.txt']

# Log the settings, then run the n training/evaluation rounds.
print("maxlen = %d, learning-rate = %f, batch-size = %d" %(maxlen, learning_rate, batch_sizes[0]))
test_model(pre_config, data_path, "albert")

maxlen = 256, learning-rate = 0.000100, batch-size = 32
####################################### ROUND 1 #######################################
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 128)    2704384     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 12

KeyboardInterrupt: ignored

### BERT-Base

In [8]:
# Dataset files of the with-emoji SMP2020 variant, in the order build_bert
# indexes them: [0] train, [1] validation, [2] test, [3] file the best
# weights are saved to and reloaded from.
data_path = ['SMP2020/with_emoji/train.csv',
             'SMP2020/with_emoji/valid.csv', 
             'SMP2020/with_emoji/test.csv',
             'SMP2020/with_emoji/saved_model/best_model.weights']
             
# Pre-trained model files: [0] config, [1] checkpoint, [2] vocabulary.
pre_config = ['bert_base_zh_google/bert_config.json', 
              'bert_base_zh_google/bert_model.ckpt',
              'bert_base_zh_google/vocab.txt']

# Log the settings, then run the n training/evaluation rounds.
print("maxlen = %d, learning-rate = %f, batch-size = %d" % (maxlen, learning_rate, batch_sizes[0]))
test_model(pre_config, data_path, "bert")

maxlen = 256, learning-rate = 0.000100, batch-size = 32
####################################### ROUND 1 #######################################
Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 768)    16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 76

KeyboardInterrupt: ignored