In [None]:
# GPU selection
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
import os
from pathlib import Path
import pickle
import multiprocessing
import numpy as np
from scipy import stats
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow.compat.v1 as tf
tf.logging.set_verbosity(tf.logging.WARN)
from tensorflow.compat.v1.keras import layers

from tqdm import tqdm

import bert.model
import bert.utils
import bert.optimizer

In [3]:
BERTLARGE     = False
USE_AMP       = True
USE_XLA       = True
MAX_SEQ_LEN   = 128
LEARNING_RATE = 1e-5
TUNE_LAYERS   = -1
DROPOUT_RATE  = 0.9
BATCH_SIZE    = 48

In [4]:
DATASET_PORTION = 0.001

if BERTLARGE:
    BERT_PATH = "https://tfhub.dev/google/bert_uncased_L-24_H-1024_A-16/1"
    H_SIZE = 1024
else:
    BERT_PATH = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
    H_SIZE = 768

sess = tf.Session()
tf.keras.backend.set_session(sess)

In [5]:
tokenizer = bert.model.create_tokenizer_from_hub_module(BERT_PATH, sess)

In [6]:
train_text, train_label, num_classes = bert.utils.load_ag_news_dataset(max_seq_len=MAX_SEQ_LEN,
                                                                       test=False)

if DATASET_PORTION < 1:
    num_examples = int(len(train_label) * DATASET_PORTION)
    _, train_text, _, train_label= train_test_split(train_text, train_label, test_size=DATASET_PORTION, stratify=train_label)
else:
    num_examples = len(train_label)

train_label = np.asarray(train_label)
feat = bert.model.convert_text_to_features(tokenizer, train_text, train_label, max_seq_length=MAX_SEQ_LEN, verbose=False)
(train_input_ids, train_input_masks, train_segment_ids, train_labels) = feat

print("Number of training examples:", len(train_labels))

examples, labels, num_classes = bert.utils.load_ag_news_dataset(max_seq_len=MAX_SEQ_LEN,
                                                                test=True)
labels = np.asarray(labels)
feat = bert.model.convert_text_to_features(tokenizer, examples, labels, max_seq_length=MAX_SEQ_LEN, verbose=False)
(test_input_ids, test_input_masks, test_segment_ids, test_labels) = feat

test_input_ids, test_input_masks, test_segment_ids, test_labels = shuffle(test_input_ids,
                                                                          test_input_masks,
                                                                          test_segment_ids,
                                                                          test_labels)

test_set = ([test_input_ids, test_input_masks, test_segment_ids], test_labels)

Loaded training set from: /home/jovyan/.keras/datasets/ag_news
Examples: 120000 Classes: 4
Number of training examples: 120
Loaded test set from: /home/jovyan/.keras/datasets/ag_news
Examples: 7600 Classes: 4


In [7]:
def create_model():
    tf.keras.backend.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if USE_XLA:
        opt_level = tf.OptimizerOptions.ON_1
        tf.enable_resource_variables()
    else:
        opt_level = tf.OptimizerOptions.OFF
    config.graph_options.optimizer_options.global_jit_level = opt_level
    config.graph_options.rewrite_options.auto_mixed_precision = USE_AMP
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)
    
    if USE_AMP:
        tf.keras.mixed_precision.experimental.set_policy('infer_float32_vars')

    in_id = layers.Input(shape=(MAX_SEQ_LEN,), name="input_ids")
    in_mask = layers.Input(shape=(MAX_SEQ_LEN,), name="input_masks")
    in_segment = layers.Input(shape=(MAX_SEQ_LEN,), name="segment_ids")
    in_bert = [in_id, in_mask, in_segment]
    l_bert = bert.model.BERT(fine_tune_layers=TUNE_LAYERS,
                             bert_path=BERT_PATH,
                             return_sequence=False,
                             output_size=H_SIZE,
                             debug=False)(in_bert)
    l_drop_2 = layers.Dropout(rate=DROPOUT_RATE)(l_bert)
    out_pred = layers.Dense(num_classes, activation="softmax")(l_drop_2)

    model = tf.keras.models.Model(inputs=in_bert, outputs=out_pred)

    opt = bert.optimizer.RAdam(lr=LEARNING_RATE)

    if USE_AMP:
        opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=opt,
                  metrics=["accuracy"])

    model.summary()
    
    return model
    
model = create_model()

W0911 03:20:10.069056 139890185373504 deprecation_wrapper.py:119] From /home/jovyan/bert-finetune/bert/model.py:121: The name tf.keras.backend.get_session is deprecated. Please use tf.compat.v1.keras.backend.get_session instead.

W0911 03:20:19.553777 139890185373504 nn_ops.py:4248] Large dropout rate: 0.9 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W0911 03:20:19.571656 139890185373504 deprecation.py:506] From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
bert (BERT)                     (None, 768)          110104890   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [8]:
def scheduler(epoch):
    warmup_steps = 26000
    warmup_epochs = warmup_steps//num_examples
    if epoch < warmup_epochs:
        return LEARNING_RATE*(epoch/warmup_epochs)
    else:
        return LEARNING_RATE
    
lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

callbacks_list = [lr_schedule, early_stop]

In [9]:
log = model.fit([train_input_ids, train_input_masks, train_segment_ids],
                train_labels, validation_data=test_set,
                verbose=2, callbacks=callbacks_list,
                epochs=1000, batch_size=BATCH_SIZE)

Train on 120 samples, validate on 7600 samples


W0911 03:20:19.947500 139890185373504 deprecation.py:323] From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1205: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1000
120/120 - 65s - loss: 3.0841 - acc: 0.2833 - val_loss: 2.1188 - val_acc: 0.2489
Epoch 2/1000
120/120 - 39s - loss: 3.7698 - acc: 0.2500 - val_loss: 2.1187 - val_acc: 0.2489
Epoch 3/1000
120/120 - 5s - loss: 3.1664 - acc: 0.2917 - val_loss: 2.1186 - val_acc: 0.2489
Epoch 4/1000
120/120 - 5s - loss: 3.3819 - acc: 0.2583 - val_loss: 2.1183 - val_acc: 0.2489
Epoch 5/1000
120/120 - 5s - loss: 3.2093 - acc: 0.2167 - val_loss: 2.1178 - val_acc: 0.2489
Epoch 6/1000
120/120 - 5s - loss: 3.3698 - acc: 0.2583 - val_loss: 2.1170 - val_acc: 0.2489
Epoch 7/1000
120/120 - 5s - loss: 2.6571 - acc: 0.2583 - val_loss: 2.1159 - val_acc: 0.2489
Epoch 8/1000
120/120 - 5s - loss: 3.4120 - acc: 0.2500 - val_loss: 2.1146 - val_acc: 0.2489
Epoch 9/1000
120/120 - 5s - loss: 3.3430 - acc: 0.2667 - val_loss: 2.1129 - val_acc: 0.2488
Epoch 10/1000
120/120 - 5s - loss: 3.5564 - acc: 0.2250 - val_loss: 2.1108 - val_acc: 0.2489
Epoch 11/1000
120/120 - 5s - loss: 3.2916 - acc: 0.2500 - val_loss: 2.1082 - 

In [10]:
[eval_loss, eval_acc] = model.evaluate([test_input_ids, test_input_masks, test_segment_ids], test_labels, verbose=2, batch_size=256)
print("Loss:", eval_loss, "Acc:", eval_acc)

7600/7600 - 10s - loss: 0.4143 - acc: 0.8603
Loss: 0.4143489755454816 Acc: 0.86026317


In [11]:
train_text, train_label, num_classes = bert.utils.load_ag_news_dataset(max_seq_len=MAX_SEQ_LEN,
                                                                       test=False)

train_label = np.asarray(train_label)
feat = bert.model.convert_text_to_features(tokenizer, train_text, train_label, max_seq_length=MAX_SEQ_LEN, verbose=False)
(train_input_ids, train_input_masks, train_segment_ids, train_labels) = feat 

Loaded training set from: /home/jovyan/.keras/datasets/ag_news
Examples: 120000 Classes: 4


In [12]:
"""
len_list = []
for example_mask in train_input_masks.tolist():
    len_list.append(np.sum(example_mask))
print(stats.describe(np.asarray(len_list)))
"""

'\nlen_list = []\nfor example_mask in train_input_masks.tolist():\n    len_list.append(np.sum(example_mask))\nprint(stats.describe(np.asarray(len_list)))\n'

In [13]:
y_softmax_list = []

num_iterations = 30

for _ in range(num_iterations):
    with tf.keras.backend.learning_phase_scope(1):
        y_pred = model.predict([train_input_ids, train_input_masks, train_segment_ids], verbose=2, batch_size=256)
    y_softmax_list.append(y_pred)

120000/120000 - 71s
120000/120000 - 69s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s


In [20]:
agg_y_softmax = np.stack(y_softmax_list, axis=-2)

probs = np.sum(agg_y_softmax, axis=1)/num_iterations
probs = probs.tolist()

margin_list = []

threshold = 0.6

false_positive = 0
positive = 0

for i, prob in enumerate(probs):
    margin = max(prob) - min(prob)
    if margin < threshold:
        margin_list.append(margin)
        pred = np.argmax(prob)
        if pred == train_label[i]:
            false_positive += 1
        else:
            positive += 1
            
print("Threshold <", threshold)
print("Correct wrong:", positive)
print("Incorrect wrong:", false_positive)
print("Useful effort:", positive/(positive+false_positive))
    
acc = []

for i, prob in enumerate(probs):
    pred = np.argmax(prob)
    if pred == train_label[i]:
        acc.append(1)
    else:
        acc.append(0)
        
acc = sum(acc)/len(probs)
print("Overall accuracy:", acc)

Threshold < 0.6
Correct wrong: 7128
Incorrect wrong: 7744
Useful effort: 0.47928994082840237
Overall accuracy: 0.8627666666666667
