In [47]:
import os
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf

使用GPU

In [48]:
# Expose only GPUs 2 and 3 to this process.
# NOTE(review): TensorFlow is already imported at this point; presumably the
# setting still applies because no GPU has been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
# Show the installed TensorFlow version as the cell output.
tf.__version__

'2.1.0'

导入[bert-for-tf2]包使用bert，以及一些相关的加载预训练权重、tokenizing的包

In [49]:
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
from tensorflow import keras
import os
import re

Next, we prepare/encode the data for feeding into our BERT model by:
  - tokenizing the text
  - trim or pad it to a `max_seq_len` length
  - append the special tokens `[CLS]` and `[SEP]`
  - convert the string tokens to numerical `ID`s using the original model's token encoding from `vocab.txt`

In [51]:
import fm
file_dir = '/home/tyx/data/split/usual/'

In [52]:
a = fm.load_file(file_dir + 'train.txt')

In [53]:
len(a)

22213

In [54]:
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
# from bert.tokenization import FullTokenizer
from bert import bert_tokenization

class data_loader:
    """Load tab-separated ``text<TAB>label`` samples and encode them for BERT.

    The constructor reads ``train.txt`` and ``dev.txt`` from ``data_dir`` through
    ``fm.load_file`` (presumably returning a sequence of lines - confirm),
    tokenizes every text, wraps it in ``[CLS]`` / ``[SEP]``, converts the tokens
    to ids and finally pads or truncates every sample to one common length.

    Attributes set by the constructor:
        train_x, test_x: 2-D integer arrays of token ids, one row per sample.
        train_x_token_types, test_x_token_types: all-zero arrays with the same
            shape as the corresponding ``*_x`` array.
        train_y, test_y: 1-D arrays of integer labels.
        max_seq_len: the final row length, i.e. the smaller of the longest
            encoded sample and the ``max_seq_len`` argument.
    """

    def __init__(self, data_dir, tokenizer: "bert_tokenization.FullTokenizer", sample_size=None, max_seq_len=1024):
        """
        Args:
            data_dir: directory prefix (including trailing separator) holding
                ``train.txt`` and ``dev.txt``.
            tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
            sample_size: if given, keep only the first ``sample_size`` lines of
                each split; must be a multiple of 128.
            max_seq_len: upper bound for the encoded sequence length.
        """
        self.tokenizer = tokenizer
        self.sample_size = sample_size
        # Running maximum of the encoded lengths; updated inside _prepare().
        self.max_seq_len = 0
        train = fm.load_file(data_dir + 'train.txt')
        test = fm.load_file(data_dir + 'dev.txt')
        if sample_size is not None:
            # Constraint kept from the original code; its motivation is not visible here.
            assert sample_size % 128 == 0
            train, test = train[:sample_size], test[:sample_size]

        self.train_x, self.train_y = self._prepare(train)
        self.test_x, self.test_y = self._prepare(test)

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.train_x, self.train_x_token_types = self._pad(self.train_x)
        self.test_x, self.test_x_token_types = self._pad(self.test_x)

    def _prepare(self, lst):
        """Tokenize ``text<TAB>label`` lines.

        Returns:
            (x, y): ``x`` is a list of variable-length token-id lists (each
            starting with ``[CLS]`` and ending with ``[SEP]``); ``y`` is a 1-D
            array of integer labels. ``x`` is deliberately left as a plain list:
            turning ragged rows into an array yields an object array (or an
            error on recent NumPy) and breaks list-based padding in _pad().
        """
        x, y = [], []
        for line in lst:
            # The label is the last field, so tabs inside the text are tolerated.
            text, label = line.rsplit('\t', 1)
            tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
            token_ids = list(self.tokenizer.convert_tokens_to_ids(tokens))
            self.max_seq_len = max(self.max_seq_len, len(token_ids))
            x.append(token_ids)
            y.append(int(label))
        return x, np.array(y)

    def _pad(self, ids):
        """Pad with 0 / truncate every id sequence to ``self.max_seq_len``.

        The sequences already contain ``[CLS]`` and ``[SEP]``, so the full
        ``max_seq_len`` budget is usable. When a sequence has to be cut, its
        last id (the ``[SEP]`` added by _prepare()) is kept as the final token.

        Returns:
            (x, t): token-id array and an all-zero token-type array, both of
            shape ``(len(ids), self.max_seq_len)``.
        """
        limit = self.max_seq_len
        rows = []
        for input_ids in ids:
            input_ids = list(input_ids)
            if len(input_ids) > limit:
                # Drop tokens from the tail of the text but preserve the trailing [SEP].
                input_ids = (input_ids[:max(limit - 1, 0)] + input_ids[-1:])[:limit]
            rows.append(input_ids + [0] * (limit - len(input_ids)))
        x = np.array(rows, dtype=int).reshape(len(rows), limit)
        return x, np.zeros_like(x)

## A tweak

Because of a `tf.train.load_checkpoint` limitation requiring list permissions on the Google Cloud Storage bucket, we need to copy the pre-trained BERT weights locally.

In [55]:
# Location of the pre-trained Chinese BERT checkpoint.
# Note: bert_ckpt_dir, bert_ckpt_file and bert_config_file are reassigned in a
# later cell to point at the local copy under .model/.
bert_ckpt_dir="../BERT/chinese_L-12_H-768_A-12/"
bert_ckpt_file = bert_ckpt_dir + "bert_model.ckpt"
bert_config_file = bert_ckpt_dir + "bert_config.json"
# Source directory and model name used by the copy step below.
bert_model_dir="../BERT"
bert_model_name="chinese_L-12_H-768_A-12"

In [56]:
%%time



# Create the local target directory .model/<bert_model_name> (no error if it exists).
!mkdir -p .model .model/$bert_model_name

# Copy the config, vocabulary and the three checkpoint files into the local directory.
for fname in ["bert_config.json", "vocab.txt", "bert_model.ckpt.meta", "bert_model.ckpt.index", "bert_model.ckpt.data-00000-of-00001"]:
  cmd = f"cp {bert_model_dir}/{bert_model_name}/{fname} .model/{bert_model_name}"
  !$cmd
# List the result so the copied files show up in the cell output.
!ls -la .model .model/$bert_model_name

.model:
total 12
drwxrwxr-x. 3 tyx tyx 4096 Aug  2 16:59 .
drwxrwxr-x. 6 tyx tyx 4096 Aug  5 15:57 ..
drwxrwxr-x. 2 tyx tyx 4096 Aug  2 17:05 chinese_L-12_H-768_A-12

.model/chinese_L-12_H-768_A-12:
total 402908
drwxrwxr-x. 2 tyx tyx      4096 Aug  2 17:05 .
drwxrwxr-x. 3 tyx tyx      4096 Aug  2 16:59 ..
-rw-r--r--. 1 tyx tyx       520 Aug  9 15:55 bert_config.json
-rw-r--r--. 1 tyx tyx 411529768 Aug  9 15:55 bert_model.ckpt.data-00000-of-00001
-rw-r--r--. 1 tyx tyx      8512 Aug  9 15:55 bert_model.ckpt.index
-rw-r--r--. 1 tyx tyx    905069 Aug  9 15:55 bert_model.ckpt.meta
-rw-r--r--. 1 tyx tyx    109540 Aug  9 15:55 vocab.txt
CPU times: user 149 ms, sys: 1.3 s, total: 1.44 s
Wall time: 9.96 s


In [57]:
# Re-point the checkpoint variables at the local copy under .model/;
# create_model() and the tokenizer read these paths.
bert_ckpt_dir    = os.path.join(".model/",bert_model_name)
bert_ckpt_file   = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

# Preparing the Data

Now let's fetch and prepare the data by taking the first `max_seq_len` tokens after tokenizing with the BERT tokenizer, and use `sample_size` examples for both training and testing.

To keep training fast, we'll take a sample of about 2500 train and test examples, respectively, and use the first 128 tokens only (transformer memory and computation requirements scale quadratically with the sequence length - so with a TPU you might use `max_seq_len=512`, but on a GPU this would be too slow, and you would have to use a very small `batch_size` to fit the model into the GPU memory).

In [58]:
%%time

# Build the tokenizer from the vocabulary of the locally copied checkpoint.
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
# Load and encode the train/dev splits; sample_size=None keeps every sample,
# and sequences are capped at 200 token ids.
data = data_loader(file_dir,tokenizer, 
                       sample_size=None,#5000, 
                       max_seq_len=200)

max seq_len 192
CPU times: user 15.9 s, sys: 132 ms, total: 16 s
Wall time: 16.1 s


In [59]:
# Report the shapes of the encoded arrays; the captions are pre-padded so the
# values line up in one column.
for caption, value in (
    ("            train_x", data.train_x.shape),
    ("train_x_token_types", data.train_x_token_types.shape),
    ("            train_y", data.train_y.shape),
    ("             test_x", data.test_x.shape),
    ("        max_seq_len", data.max_seq_len),
):
    print(caption, value)

            train_x (22213, 192)
train_x_token_types (22213, 192)
            train_y (22213,)
             test_x (5553, 192)
        max_seq_len 192


## Adapter BERT

If we decide to use [adapter-BERT](https://arxiv.org/abs/1902.00751) we need some helpers for freezing the original BERT layers.

In [60]:

def flatten_layers(root_layer):
    """Yield ``root_layer`` and all of its nested sub-layers, parents first.

    ``root_layer`` itself is yielded only when it is a ``keras.layers.Layer``;
    the children are taken from its private ``_layers`` attribute and walked
    recursively in order.
    """
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for child in root_layer._layers:
        yield from flatten_layers(child)


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.

    Walks every layer returned by ``flatten_layers(l_bert)``:
      * layers named ``LayerNorm``, ``adapter-down`` or ``adapter-up`` are made trainable;
      * every other layer without sub-layers is frozen.
    Finally the embeddings layer is frozen as a whole.

    Args:
        l_bert: the BERT layer; must expose ``embeddings_layer``.
    """
    for layer in flatten_layers(l_bert):
        if layer.name in ("LayerNorm", "adapter-down", "adapter-up"):
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
    # Loop-invariant: previously re-executed for every visited layer. Doing it
    # once after the walk leaves the same final state.
    # NOTE(review): Keras presumably propagates `trainable` to sub-layers, so a
    # LayerNorm inside the embeddings ends up frozen as well — confirm.
    l_bert.embeddings_layer.trainable = False


def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):
    """Build a Keras ``LearningRateScheduler`` callback with warm-up and decay.

    For epochs below ``warmup_epoch_count`` the rate grows linearly up to
    ``max_learn_rate``; afterwards it decays exponentially from
    ``max_learn_rate`` towards ``end_learn_rate``.

    Returns:
        ``tf.keras.callbacks.LearningRateScheduler`` created with ``verbose=1``.
    """

    def lr_scheduler(epoch):
        # Decay phase: exponential interpolation in log space.
        if not epoch < warmup_epoch_count:
            log_ratio = math.log(end_learn_rate/max_learn_rate)
            steps_into_decay = epoch-warmup_epoch_count+1
            decay_span = total_epoch_count-warmup_epoch_count+1
            return float(max_learn_rate*math.exp(log_ratio*steps_into_decay/decay_span))
        # Warm-up phase: linear ramp, one increment per epoch.
        return float((max_learn_rate/warmup_epoch_count) * (epoch + 1))

    return tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)


# Creating a model

Now let's create a classification model using [adapter-BERT](https://arxiv.org/abs/1902.00751), which is a clever way of reducing the trainable parameter count, by freezing the original BERT weights, and adapting them with two FFN bottlenecks (i.e. `adapter_size` below) in every BERT layer.

**N.B.** The commented-out code below shows how to feed a `token_type_ids`/`segment_ids` sequence (which is not needed in our case).

In [61]:
def create_model(max_seq_len, adapter_size=64, num_classes=6):
    """Creates a classification model.

    Builds a BERT layer from ``bert_config_file``, takes the output at the
    first position (``[CLS]``), and adds a dropout / tanh-dense / dropout /
    softmax head on top. The pre-trained weights from ``bert_ckpt_file`` are
    loaded into the BERT layer, and the original BERT weights are frozen when
    adapters are used.

    Args:
        max_seq_len: length of the ``input_ids`` sequences fed to the model.
        adapter_size: adapter bottleneck size (see arXiv:1902.00751); ``None``
            skips the freezing step so the whole BERT layer is fine-tuned.
        num_classes: number of output classes of the softmax head.

    Returns:
        The compiled ``keras.Model``; its outputs are class probabilities.
    """

    # create the bert layer
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
    bert_params = map_stock_config_to_params(bc)
    bert_params.adapter_size = adapter_size
    # Named l_bert so the imported `bert` module is not shadowed.
    l_bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    # token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="token_type_ids")
    # output         = l_bert([input_ids, token_type_ids])
    output         = l_bert(input_ids)

    print("bert shape", output.shape)
    # Use the representation at position 0 ([CLS]) as the sentence embedding.
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    hidden = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    hidden = keras.layers.Dropout(0.5)(hidden)
    probs = keras.layers.Dense(units=num_classes, activation="softmax")(hidden)

    # model = keras.Model(inputs=[input_ids, token_type_ids], outputs=probs)
    # model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
    model = keras.Model(inputs=input_ids, outputs=probs)
    model.build(input_shape=(None, max_seq_len))

    # load the pre-trained model weights
    load_stock_weights(l_bert, bert_ckpt_file)

    # freeze weights if adapter-BERT is used
    if adapter_size is not None:
        freeze_bert_layers(l_bert)

    # The head already applies softmax, so the loss must treat the outputs as
    # probabilities; from_logits=True would apply softmax a second time.
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    model.summary()

    return model


In [62]:
# Adapter bottleneck size passed to create_model(); with a non-None value the
# original BERT weights are frozen by freeze_bert_layers().
adapter_size = 1 # use None to fine-tune all of BERT
model = create_model(data.max_seq_len, adapter_size=adapter_size)

bert shape (None, 192, 768)
loader: No value for:[bert_3/encoder/layer_0/attention/output/adapter-down/kernel:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-down/kernel] in:[.model/chinese_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert_3/encoder/layer_0/attention/output/adapter-down/bias:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-down/bias] in:[.model/chinese_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert_3/encoder/layer_0/attention/output/adapter-up/kernel:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-up/kernel] in:[.model/chinese_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert_3/encoder/layer_0/attention/output/adapter-up/bias:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-up/bias] in:[.model/chinese_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert_3/encoder/layer_0/output/adapter-down/kernel:0], i.e.:[bert/encoder/layer_0/output/adapter-down/kernel] in:[.model/chinese_L-12_H-768_A-12/bert_model

In [None]:
%%time

# One timestamp for the whole run so the TensorBoard log directory and the
# saved weights file can be matched. "%S" (seconds) replaces the original
# "%s", which is a non-portable platform extension.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = ".log/bert_origin/" + run_stamp
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
total_epoch_count = 50

# Create the output directory up front so a missing folder cannot make
# save_weights() fail after the whole training run has finished.
os.makedirs('./models', exist_ok=True)

# model.fit(x=(data.train_x, data.train_x_token_types), y=data.train_y,
model.fit(x=data.train_x, y=data.train_y,
          validation_split=0.1,
          batch_size=16,
          shuffle=True,
          epochs=total_epoch_count,
          callbacks=[create_learning_rate_scheduler(max_learn_rate=5e-5,
                                                    end_learn_rate=1e-6,
                                                    warmup_epoch_count=20,
                                                    total_epoch_count=total_epoch_count),
                     keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
                     tensorboard_callback])

model.save_weights('./models/bert' + run_stamp + '.h5', overwrite=True)

Train on 19991 samples, validate on 2222 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 2.5e-06.
Epoch 1/50

Epoch 00002: LearningRateScheduler reducing learning rate to 5e-06.
Epoch 2/50

Epoch 00003: LearningRateScheduler reducing learning rate to 7.500000000000001e-06.
Epoch 3/50

Epoch 00004: LearningRateScheduler reducing learning rate to 1e-05.
Epoch 4/50

Epoch 00005: LearningRateScheduler reducing learning rate to 1.25e-05.
Epoch 5/50

Epoch 00006: LearningRateScheduler reducing learning rate to 1.5000000000000002e-05.
Epoch 6/50

Epoch 00007: LearningRateScheduler reducing learning rate to 1.7500000000000002e-05.
Epoch 7/50

Epoch 00008: LearningRateScheduler reducing learning rate to 2e-05.
Epoch 8/50

Epoch 00009: LearningRateScheduler reducing learning rate to 2.25e-05.
Epoch 9/50

Epoch 00010: LearningRateScheduler reducing learning rate to 2.5e-05.
Epoch 10/50

Epoch 00011: LearningRateScheduler reducing learning rate to 2.75e-05.
Epoch 11/50

Epoch

In [None]:
%%time

# model.evaluate() returns the loss followed by the compiled metrics;
# only the accuracy ("acc") is kept here.
_, train_acc = model.evaluate(data.train_x, data.train_y)
_, test_acc = model.evaluate(data.test_x, data.test_y)

print("train acc", train_acc)
print(" test acc", test_acc)



# Evaluation

To evaluate the trained model, let's load the saved weights in a new model instance, and evaluate.

In [20]:
%%time 

# Build a fresh model with the same adapter size and load previously saved weights.
model = create_model(data.max_seq_len, adapter_size=1)
# NOTE(review): "movie_reviews.h5" is not written anywhere in this notebook; the
# training cell saves to ./models/bert<timestamp>.h5 — confirm the intended file.
model.load_weights("movie_reviews.h5")

# Evaluate the reloaded model on both splits; only the accuracy metric is kept.
_, train_acc = model.evaluate(data.train_x, data.train_y)
_, test_acc = model.evaluate(data.test_x, data.test_y)

print("train acc", train_acc)
print(" test acc", test_acc)

bert shape (None, 200, 768)
loader: No value for:[bert_1/encoder/layer_0/attention/output/adapter-down/kernel:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-down/kernel] in:[.model/chinese_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert_1/encoder/layer_0/attention/output/adapter-down/bias:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-down/bias] in:[.model/chinese_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert_1/encoder/layer_0/attention/output/adapter-up/kernel:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-up/kernel] in:[.model/chinese_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert_1/encoder/layer_0/attention/output/adapter-up/bias:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-up/bias] in:[.model/chinese_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert_1/encoder/layer_0/output/adapter-down/kernel:0], i.e.:[bert/encoder/layer_0/output/adapter-down/kernel] in:[.model/chinese_L-12_H-768_A-12/bert_model

# Prediction

For prediction, we need to prepare the input text the same way as we did for training - tokenize, add the special `[CLS]` and `[SEP]` tokens at the beginning and end of the token sequence, and pad to match the model input shape.

In [None]:
# NOTE(review): these English example sentences look like leftovers from a
# movie-review demo, while the checkpoint used here is chinese_L-12_H-768_A-12
# — confirm they are the intended inputs.
pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
max_len = data.max_seq_len

# Encode each sentence as [CLS] tokens [SEP] and bring it to exactly max_len ids.
pred_token_ids = []
for sentence in pred_sentences:
    tokens = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
    tids = list(tokenizer.convert_tokens_to_ids(tokens))
    if len(tids) > max_len:
        # Too long for the model input: cut the text but keep the trailing [SEP].
        tids = tids[:max_len - 1] + tids[-1:]
    pred_token_ids.append(tids + [0] * (max_len - len(tids)))
pred_token_ids = np.array(pred_token_ids)

print('pred_token_ids', pred_token_ids.shape)

# argmax over the class axis gives one class index per sentence.
res = model.predict(pred_token_ids).argmax(axis=-1)

# The model head has more than two classes, so indexing a two-entry
# ["negative", "positive"] list raised IndexError for higher class ids.
# The class names are not defined in this notebook, so the raw index is printed.
for text, sentiment in zip(pred_sentences, res):
    print(" text:", text)
    print("  res:", int(sentiment))