# BERT Training

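Single-GPU BERT training notebook: fine-tunes a TF-Hub BERT model on the AG News dataset and uses Monte Carlo dropout to gauge how consistent its predictions are.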

In [1]:
# GPU selection: both variables are written to the process environment in the
# first cell, ahead of the TensorFlow import in the next one.
import os

os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="3",
)

In [23]:
# Suppress every Python warning for the remainder of the notebook.
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import pickle
import multiprocessing
import numpy as np
from scipy import stats
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# `os` comes from the GPU-selection cell. The log level is set before the
# TensorFlow import below — presumably so it applies to TF's native logging
# from start-up; confirm against the TensorFlow docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow.compat.v1 as tf
# Only report errors through TensorFlow's Python logger.
tf.logging.set_verbosity(tf.logging.ERROR)
from tensorflow.compat.v1.keras import layers

# Helper modules used below (presumably local to this project — verify they
# are importable from the notebook's working directory).
import utils
import bert_utils
import bert_optimizer

## Notebook Parameters

In [3]:
# Switches and hyper-parameters read by the cells below.
BERTLARGE = False  # True -> L-24/H-1024 hub module, False -> L-12/H-768
USE_AMP = True  # turns on the auto-mixed-precision rewrite and loss scaling
USE_XLA = True  # turns on XLA JIT (and resource variables) in the session config
MAX_SEQ_LEN = 128  # sequence length used for tokenisation and the model inputs
LEARNING_RATE = 2e-5  # optimizer learning rate, also the scheduler's plateau value
TUNE_LAYERS = -1  # forwarded as fine_tune_layers to bert_utils.BERT (meaning of -1: see that helper)
DATASET_PORTION = 0.01  # fraction of the training split to keep; values >= 1 keep everything

In [4]:
# Pick the TF-Hub module URL together with its matching hidden size.
BERT_PATH, H_SIZE = (
    ("https://tfhub.dev/google/bert_uncased_L-24_H-1024_A-16/1", 1024)
    if BERTLARGE
    else ("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1", 768)
)

## Create TensorFlow session

In [5]:
# GPU threading knobs passed to TensorFlow through the environment
# (presumably two private threads per GPU — confirm against the TF docs).
os.environ.update(TF_GPU_THREAD_MODE="gpu_private", TF_GPU_THREAD_COUNT="2")

# The JIT level follows USE_XLA; resource variables are enabled only with XLA.
opt_level = tf.OptimizerOptions.ON_1 if USE_XLA else tf.OptimizerOptions.OFF
if USE_XLA:
    tf.enable_resource_variables()

config = tf.ConfigProto()
# Let the GPU allocation grow on demand.
config.gpu_options.allow_growth = True
config.graph_options.optimizer_options.global_jit_level = opt_level
config.graph_options.rewrite_options.auto_mixed_precision = USE_AMP

# Create the session and register it as the one Keras uses.
sess = tf.Session(config=config)
tf.keras.backend.set_session(sess)

## Load Dataset

### Create Tokenizer

In [6]:
# Build the tokenizer from the selected hub module, using the session created
# above (the helper lives in bert_utils; its internals are not shown here).
tokenizer = bert_utils.create_tokenizer_from_hub_module(BERT_PATH, sess)

### Preprocess Data

In [7]:
# Load the AG News training split: texts, labels and the number of classes.
train_text, train_label, num_classes = utils.load_ag_news_dataset(
    max_seq_len=MAX_SEQ_LEN, test=False
)

# Optionally keep only a stratified fraction of the training data. The "test"
# side of the split is the part that is kept, which is why DATASET_PORTION is
# passed as test_size.
if DATASET_PORTION < 1:
    _, train_text, _, train_label = train_test_split(
        train_text, train_label, test_size=DATASET_PORTION, stratify=train_label
    )

train_label = np.asarray(train_label)
# The training-set size is taken from the data actually kept; the learning-rate
# scheduler reads this value. (Earlier estimates that were immediately
# overwritten have been dropped.)
num_examples = len(train_label)

train_examples = bert_utils.convert_text_to_examples(train_text, train_label)
feat = bert_utils.convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=MAX_SEQ_LEN, verbose=True
)

# feat unpacks into input ids, input masks, segment ids and labels.
(train_input_ids, train_input_masks, train_segment_ids, train_labels) = feat

# Shuffle all four arrays together so that rows stay aligned.
train_input_ids, train_input_masks, train_segment_ids, train_labels = shuffle(
    train_input_ids, train_input_masks, train_segment_ids, train_labels
)

Converting examples to features:  10%|█         | 122/1200 [00:00<00:00, 1214.20it/s]

Loaded training set from: /home/jovyan/.keras/datasets/ag_news
Examples: 120000 Classes: 4


Converting examples to features: 100%|██████████| 1200/1200 [00:00<00:00, 1227.60it/s]


In [8]:
# Load the AG News test split and convert it into BERT input features.
examples, labels, num_classes = utils.load_ag_news_dataset(
    max_seq_len=MAX_SEQ_LEN, test=True
)
labels = np.asarray(labels)
test_examples = bert_utils.convert_text_to_examples(examples, labels)
feat = bert_utils.convert_examples_to_features(
    tokenizer, test_examples, max_seq_length=MAX_SEQ_LEN, verbose=True
)

# feat holds (input ids, input masks, segment ids, labels); shuffle the four
# arrays together so that rows stay aligned.
test_input_ids, test_input_masks, test_segment_ids, test_labels = shuffle(*feat)

# Bundled as (inputs, targets) so it can be handed to Keras as validation data.
test_set = ([test_input_ids, test_input_masks, test_segment_ids], test_labels)

Converting examples to features:   2%|▏         | 126/7600 [00:00<00:05, 1250.32it/s]

Loaded test set from: /home/jovyan/.keras/datasets/ag_news
Examples: 7600 Classes: 4


Converting examples to features: 100%|██████████| 7600/7600 [00:06<00:00, 1228.91it/s]


## Build Keras Model

In [9]:
class MCDropout(tf.keras.layers.Dropout):
    """Dropout layer that always runs the parent ``call`` with ``training=True``.

    Because the flag is hard-wired, dropout is applied during prediction as
    well as during training; the notebook relies on this to draw several
    differing predictions from the same model.
    """

    def call(self, inputs):
        # Ignore the Keras learning phase and force the training-time path.
        return super(MCDropout, self).call(inputs, training=True)

In [10]:
# With AMP enabled, set the Keras mixed-precision policy before building layers.
if USE_AMP:
    tf.keras.mixed_precision.experimental.set_policy('infer_float32_vars')

# Three inputs of length MAX_SEQ_LEN, created in the order the BERT layer takes them.
in_id, in_mask, in_segment = (
    layers.Input(shape=(MAX_SEQ_LEN,), name=input_name)
    for input_name in ("input_ids", "input_masks", "segment_ids")
)
in_bert = [in_id, in_mask, in_segment]

# BERT layer -> always-on dropout -> softmax classifier over num_classes.
l_bert = bert_utils.BERT(
    fine_tune_layers=TUNE_LAYERS,
    bert_path=BERT_PATH,
    return_sequence=False,
    output_size=H_SIZE,
    debug=False,
)(in_bert)
l_drop = MCDropout(rate=0.5)(l_bert)
out_pred = layers.Dense(num_classes, activation="softmax")(l_drop)

model = tf.keras.models.Model(inputs=in_bert, outputs=out_pred)

In [11]:
# RAdam from bert_optimizer is used here in place of the stock Keras Adam.
opt = bert_optimizer.RAdam(lr=LEARNING_RATE)

# Under AMP the optimizer is wrapped with dynamic loss scaling.
opt = (
    tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")
    if USE_AMP
    else opt
)

In [12]:
# Labels are fed as integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer=opt,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [13]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
bert (BERT)                     (None, 768)          110104890   input_ids[0][0]                  
                                                                 input_masks[0][0]            

## Train Model

In [14]:
def scheduler(epoch):
    """Return the learning rate for an epoch: linear warmup, then constant.

    Args:
        epoch: Epoch index as passed in by the Keras LearningRateScheduler
            callback (0-based).

    Returns:
        LEARNING_RATE scaled by ``(epoch + 1) / warmup_epochs`` while
        ``epoch < warmup_epochs``, and LEARNING_RATE afterwards. The ramp is
        offset by one so that the very first epoch already trains with a
        non-zero rate instead of spending a full pass over the data at 0.

    Reads the module-level ``num_examples`` and ``LEARNING_RATE``.
    """
    warmup_steps = 26000
    # NOTE(review): this divides a step budget by the number of examples, not
    # by steps per epoch — confirm that this is the intended unit.
    warmup_epochs = warmup_steps // num_examples
    # When warmup_epochs is 0 the branch is never taken, so no division by zero.
    if epoch < warmup_epochs:
        return LEARNING_RATE * ((epoch + 1) / warmup_epochs)
    return LEARNING_RATE

# Per-epoch learning-rate schedule driven by scheduler().
lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)

callbacks_list = [lr_schedule, early_stop]

In [15]:
# Train until early stopping triggers (epochs=1000 is only an upper bound).
# NOTE(review): the test set doubles as validation data here, so early stopping
# and the restored "best" weights are selected on the same data that is
# evaluated afterwards.
# NOTE(review): workers / use_multiprocessing are presumably only relevant for
# generator or Sequence inputs, not in-memory arrays — confirm before relying on them.
log = model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    batch_size=56,
    epochs=1000,
    verbose=2,
    callbacks=callbacks_list,
    validation_data=test_set,
    workers=4,
    use_multiprocessing=True,
)

Train on 1200 samples, validate on 7600 samples
Epoch 1/1000
1200/1200 - 70s - loss: 1.8929 - acc: 0.2608 - val_loss: 1.9099 - val_acc: 0.2563
Epoch 2/1000
1200/1200 - 41s - loss: 1.9078 - acc: 0.2575 - val_loss: 1.8690 - val_acc: 0.2617
Epoch 3/1000
1200/1200 - 8s - loss: 1.8701 - acc: 0.2400 - val_loss: 1.7860 - val_acc: 0.2654
Epoch 4/1000
1200/1200 - 8s - loss: 1.7478 - acc: 0.2675 - val_loss: 1.6429 - val_acc: 0.2947
Epoch 5/1000
1200/1200 - 8s - loss: 1.5300 - acc: 0.3258 - val_loss: 1.5222 - val_acc: 0.3259
Epoch 6/1000
1200/1200 - 8s - loss: 1.4732 - acc: 0.3450 - val_loss: 1.4603 - val_acc: 0.3475
Epoch 7/1000
1200/1200 - 8s - loss: 1.4466 - acc: 0.3692 - val_loss: 1.3668 - val_acc: 0.3957
Epoch 8/1000
1200/1200 - 8s - loss: 1.3124 - acc: 0.4383 - val_loss: 1.2634 - val_acc: 0.4471
Epoch 9/1000
1200/1200 - 8s - loss: 1.2205 - acc: 0.4717 - val_loss: 1.1633 - val_acc: 0.5014
Epoch 10/1000
1200/1200 - 8s - loss: 1.0657 - acc: 0.5625 - val_loss: 1.0229 - val_acc: 0.5763
Epoch 11/

In [16]:
# Score the trained model on the test features; evaluate() returns loss then accuracy.
eval_loss, eval_acc = model.evaluate(
    [test_input_ids, test_input_masks, test_segment_ids],
    test_labels,
    batch_size=256,
    verbose=2,
)
print("Loss:", eval_loss, "Acc:", eval_acc)

7600/7600 - 10s - loss: 0.3699 - acc: 0.8786
Loss: 0.36994663483218143 Acc: 0.8785526


In [17]:
# Reload the complete training split (no sub-sampling, no shuffling) so the
# trained model can be run over every training example.
train_text, train_label, num_classes = utils.load_ag_news_dataset(
    max_seq_len=MAX_SEQ_LEN, test=False
)
train_label = np.asarray(train_label)

train_examples = bert_utils.convert_text_to_examples(train_text, train_label)
feat = bert_utils.convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=MAX_SEQ_LEN, verbose=False
)

# feat holds (input ids, input masks, segment ids, labels), in input order.
train_input_ids, train_input_masks, train_segment_ids, train_labels = feat

Loaded training set from: /home/jovyan/.keras/datasets/ag_news
Examples: 120000 Classes: 4


In [18]:
# Run the model 10 times over the full training set. Dropout stays active at
# prediction time (MCDropout), so each pass can yield different classes.
y_pred_list = []

for _ in range(10):
    y_pred = model.predict(
        [train_input_ids, train_input_masks, train_segment_ids],
        batch_size=256,
        verbose=2,
    )
    # Keep only the arg-max class of each example for this pass.
    y_pred_class = y_pred.argmax(axis=1)
    y_pred_list.append(y_pred_class)

# One row per example, one column per prediction pass.
agg_y_pred = np.stack(y_pred_list, axis=-1)

120000/120000 - 71s
120000/120000 - 69s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s
120000/120000 - 64s


In [28]:
from tqdm import tqdm

In [47]:
# Tallies for how well "the passes agree" works as a signal that the majority
# prediction is right. In the comments below, "guess" is that signal's verdict.
acc = corr_guess = wrong = false_pos = false_neg = 0

for i, truth in tqdm(enumerate(train_label), total=len(train_label)):
    pred = agg_y_pred[i, :]
    # Majority class over the prediction passes for this example.
    m = stats.mode(pred).mode
    correct = truth == m

    # Count the passes that disagree with the majority class.
    diff = sum(1 for item in pred if item != m)
    # The passes count as agreeing unless more than 5 of them deviate.
    same = diff <= 5

    if correct:
        if same:
            # correct guess as correct
            acc += 1
        else:
            # wrong guess as wrong
            false_pos += 1
            wrong += 1
    elif same:
        # wrong guess as correct
        false_neg += 1
        wrong += 1
    else:
        # correct guess as wrong
        corr_guess += 1
        acc += 1

100%|██████████| 120000/120000 [00:22<00:00, 5280.89it/s]


In [48]:
# Each example was counted in exactly one of `acc` / `wrong`, so their sum is
# the number of examples evaluated. Using it keeps the percentage right for any
# dataset size instead of relying on a hard-coded 120000.
total = acc + wrong
# Guard the division so an empty tally reports 0.0 % rather than raising.
wrong_pct = round(wrong / total * 100, 1) if total else 0.0

print("Correct guesses:", acc)
print("Wrong guesses:", wrong, ";", wrong_pct, "%")
print("Correct guess as wrong:", corr_guess)
print("Wrong guess as wrong:", false_pos)

Correct guesses: 106786
Wrong guesses: 13214 ; 11.0 %
Correct guess as wrong: 101
Wrong guess as wrong: 55
