In [None]:
# GPU selection
import os

# Must be set before any CUDA-using library initialises: order devices by
# PCI bus id and make only device "3" visible to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

In [None]:
import os
from pathlib import Path
import pickle
import multiprocessing
import numpy as np
from scipy import stats
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow.compat.v1 as tf
tf.logging.set_verbosity(tf.logging.WARN)
from tensorflow.compat.v1.keras import layers

from tqdm import tqdm

import bert.model
import bert.utils
import bert.optimizer

In [None]:
# --- Run configuration -------------------------------------------------------
# Choose the 24-layer/1024-hidden hub module (True) or the 12-layer/768-hidden one (False).
BERTLARGE = False
# Enable the auto-mixed-precision graph rewrite, the Keras mixed-precision
# policy and dynamic loss scaling when the model is built.
USE_AMP = True
# Enable XLA JIT compilation (global_jit_level ON_1) and resource variables.
USE_XLA = True
# Sequence length used for dataset loading, feature conversion and the input layers.
MAX_SEQ_LEN = 128
# Target learning rate for the RAdam optimizer and the warm-up schedule.
LEARNING_RATE = 1e-5
# Passed as `fine_tune_layers` to bert.model.BERT.
# NOTE(review): presumably -1 means "fine-tune every layer" — confirm in bert.model.
TUNE_LAYERS = -1
# Rate shared by the SpatialDropout1D and Dropout layers of the classifier head.
DROPOUT_RATE = 0.5
# Mini-batch size used by model.fit.
BATCH_SIZE = 48

In [None]:
# Fraction of the training split used for fine-tuning; values >= 1 keep all of it.
DATASET_PORTION = 0.005

# Pick the TF-Hub BERT module and the matching hidden size.
if BERTLARGE:
    BERT_PATH = "https://tfhub.dev/google/bert_uncased_L-24_H-1024_A-16/1"
    H_SIZE = 1024
else:
    BERT_PATH = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
    H_SIZE = 768

# Bootstrap session (used to create the tokenizer before the model exists).
# GPU memory options are applied when the process first initialises the GPU,
# i.e. when the first session is created. A default session here would reserve
# (almost) all GPU memory up front and make the `allow_growth` requested later
# in create_model() ineffective, so request on-demand growth right away.
sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))
tf.keras.backend.set_session(sess)

In [None]:
# Build the tokenizer for the selected hub module (BERT_PATH) using the
# module-level session `sess`; it is reused for every feature conversion below.
tokenizer = bert.model.create_tokenizer_from_hub_module(BERT_PATH, sess)

In [None]:
# --- Training data -----------------------------------------------------------
# Load the training split (texts, labels, number of classes).
train_text, train_label, num_classes = bert.utils.load_ag_news_dataset(max_seq_len=MAX_SEQ_LEN,
                                                                       test=False)

if DATASET_PORTION < 1:
    # Keep a class-stratified subset: the "test" side of the split (size
    # DATASET_PORTION) is the part we train on, the rest is discarded.
    _, train_text, _, train_label = train_test_split(train_text, train_label,
                                                     test_size=DATASET_PORTION,
                                                     stratify=train_label)

# Count the examples that were actually kept. train_test_split applies its own
# rounding to fractional sizes, so measuring the result is more reliable than
# recomputing int(len * DATASET_PORTION).
num_examples = len(train_label)

train_label = np.asarray(train_label)
feat = bert.model.convert_text_to_features(tokenizer, train_text, train_label, max_seq_length=MAX_SEQ_LEN, verbose=False)
(train_input_ids, train_input_masks, train_segment_ids, train_labels) = feat

print("Number of training examples:", len(train_labels))

# --- Test data ---------------------------------------------------------------
examples, labels, num_classes = bert.utils.load_ag_news_dataset(max_seq_len=MAX_SEQ_LEN,
                                                                test=True)
labels = np.asarray(labels)
feat = bert.model.convert_text_to_features(tokenizer, examples, labels, max_seq_length=MAX_SEQ_LEN, verbose=False)
(test_input_ids, test_input_masks, test_segment_ids, test_labels) = feat

# Shuffle all test arrays with one shared permutation so rows stay aligned.
test_input_ids, test_input_masks, test_segment_ids, test_labels = shuffle(test_input_ids,
                                                                          test_input_masks,
                                                                          test_segment_ids,
                                                                          test_labels)

# (inputs, labels) tuple in the form expected by Keras `validation_data`.
test_set = ([test_input_ids, test_input_masks, test_segment_ids], test_labels)

In [None]:
def create_model():
    """Build and compile the BERT text classifier inside a fresh Keras session.

    Clears the current Keras session, installs a new ``tf.Session`` configured
    from ``USE_XLA`` / ``USE_AMP`` (GPU memory growth enabled), then assembles
    BERT -> Reshape -> SpatialDropout1D -> Conv1D (kernel size 1) -> Flatten ->
    Dropout -> softmax Dense over ``num_classes`` outputs.

    Returns:
        The compiled ``tf.keras`` model (sparse categorical cross-entropy loss,
        RAdam optimizer, accuracy metric). The model summary is printed as a
        side effect.
    """
    # Start from a clean graph before configuring the new session.
    tf.keras.backend.clear_session()

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    if USE_XLA:
        tf.enable_resource_variables()
        jit_level = tf.OptimizerOptions.ON_1
    else:
        jit_level = tf.OptimizerOptions.OFF
    graph_options = session_config.graph_options
    graph_options.optimizer_options.global_jit_level = jit_level
    graph_options.rewrite_options.auto_mixed_precision = USE_AMP
    tf.keras.backend.set_session(tf.Session(config=session_config))

    # The mixed-precision policy has to be in place before any layer is created.
    if USE_AMP:
        tf.keras.mixed_precision.experimental.set_policy('infer_float32_vars')

    # Three parallel integer-sequence inputs, in the order the BERT layer receives them.
    bert_inputs = [
        layers.Input(shape=(MAX_SEQ_LEN,), name=input_name)
        for input_name in ("input_ids", "input_masks", "segment_ids")
    ]

    encoder = bert.model.BERT(fine_tune_layers=TUNE_LAYERS,
                              bert_path=BERT_PATH,
                              return_sequence=True,
                              output_size=H_SIZE,
                              debug=False)
    features = encoder(bert_inputs)
    features = layers.Reshape((MAX_SEQ_LEN, H_SIZE))(features)
    features = layers.SpatialDropout1D(rate=DROPOUT_RATE)(features)
    features = layers.Conv1D(H_SIZE // 2, 1)(features)
    features = layers.Flatten()(features)
    features = layers.Dropout(rate=DROPOUT_RATE)(features)
    class_probs = layers.Dense(num_classes, activation="softmax")(features)

    classifier = tf.keras.models.Model(inputs=bert_inputs, outputs=class_probs)

    optimizer = bert.optimizer.RAdam(lr=LEARNING_RATE)
    if USE_AMP:
        # Dynamic loss scaling to accompany the mixed-precision policy.
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")

    classifier.compile(loss="sparse_categorical_crossentropy",
                       optimizer=optimizer,
                       metrics=["accuracy"])
    classifier.summary()

    return classifier


model = create_model()

In [None]:
def scheduler(epoch):
    """Learning-rate schedule: linear warm-up followed by a constant rate.

    Args:
        epoch: 0-based epoch index supplied by ``LearningRateScheduler``.

    Returns:
        The learning rate to use for ``epoch``. During the warm-up epochs the
        rate ramps linearly up to ``LEARNING_RATE``; afterwards it stays at
        ``LEARNING_RATE``.
    """
    # NOTE(review): despite its name, this budget is divided by the number of
    # training examples (not by optimizer steps per epoch) — confirm the unit.
    warmup_steps = 26000
    # max(..., 1) avoids a ZeroDivisionError if the training subset is empty.
    warmup_epochs = warmup_steps // max(num_examples, 1)
    if epoch < warmup_epochs:
        # Use (epoch + 1) so the very first epoch trains with a non-zero rate;
        # with `epoch / warmup_epochs` epoch 0 would run at a learning rate of 0.
        return LEARNING_RATE * ((epoch + 1) / warmup_epochs)
    return LEARNING_RATE


lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

callbacks_list = [lr_schedule, early_stop]

In [None]:
# Fine-tune until early stopping triggers (1000 is only an upper bound on epochs).
# Note: the test set doubles as the validation set, so early stopping and the
# restored "best" weights are selected on the same data that is evaluated later.
log = model.fit(
    x=[train_input_ids, train_input_masks, train_segment_ids],
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=1000,
    verbose=2,
    callbacks=callbacks_list,
    validation_data=test_set,
)

In [None]:
# Loss and accuracy of the trained model on the test set.
eval_loss, eval_acc = model.evaluate(
    x=[test_input_ids, test_input_masks, test_segment_ids],
    y=test_labels,
    batch_size=256,
    verbose=2,
)
print("Loss:", eval_loss, "Acc:", eval_acc)

In [None]:
# Reload the complete training split (no sub-sampling this time) and convert it
# to BERT features. This overwrites the train_* variables used for fitting, so
# the cells below operate on the full training set.
train_text, train_label, num_classes = bert.utils.load_ag_news_dataset(
    max_seq_len=MAX_SEQ_LEN,
    test=False,
)
train_label = np.asarray(train_label)

feat = bert.model.convert_text_to_features(
    tokenizer,
    train_text,
    train_label,
    max_seq_length=MAX_SEQ_LEN,
    verbose=False,
)
train_input_ids, train_input_masks, train_segment_ids, train_labels = feat

In [None]:
# Collect the softmax outputs of several forward passes over the training set;
# each entry appended below is the result of one full model.predict call.
y_softmax_list = []

# Number of repeated forward passes.
num_iterations = 10

for _ in range(num_iterations):
    # Run predict with the Keras learning phase set to 1 (training).
    # NOTE(review): presumably intended to keep dropout active so the passes
    # differ (Monte-Carlo dropout) — confirm that the passes really produce
    # different outputs with this TF/Keras version.
    with tf.keras.backend.learning_phase_scope(1):
        y_pred = model.predict([train_input_ids, train_input_masks, train_segment_ids], verbose=2, batch_size=256)
    y_softmax_list.append(y_pred)

In [None]:
import copy

In [None]:
# Stack the repeated forward passes along a new second-to-last axis:
# (num_examples, num_passes, num_classes).
agg_y_softmax = np.stack(y_softmax_list, axis=-2)

# Average the class probabilities over the passes. float64 keeps the
# arithmetic identical to working on Python floats.
mean_probs = np.asarray(agg_y_softmax.mean(axis=1), dtype=np.float64)
# Nested-list view of the averaged probabilities, kept under the name `probs`.
probs = mean_probs.tolist()

# Examples whose margin falls below this value are flagged as uncertain.
threshold = 0.3

true_labels = np.asarray(train_label).reshape(-1)
if true_labels.shape[0] != mean_probs.shape[0]:
    raise ValueError(
        "Label/prediction count mismatch: %d labels vs %d predictions"
        % (true_labels.shape[0], mean_probs.shape[0])
    )

pred_labels = mean_probs.argmax(axis=1)
is_correct = pred_labels == true_labels

# Margin = highest averaged class probability minus the lowest one.
# NOTE(review): a top-1 minus top-2 margin is the more common uncertainty
# measure — confirm that max - min is the intended definition.
margins = mean_probs.max(axis=1) - mean_probs.min(axis=1)
is_flagged = margins < threshold
margin_list = margins[is_flagged].tolist()

# Flagged examples whose prediction disagrees with the label ...
positive = int(np.count_nonzero(is_flagged & ~is_correct))
# ... and flagged examples whose prediction matches the label.
false_positive = int(np.count_nonzero(is_flagged & is_correct))

print("Threshold <", threshold)
print("Correct wrong:", positive)
print("Incorrect wrong:", false_positive)
# Share of flagged examples that are actual mispredictions; NaN (instead of a
# ZeroDivisionError) when nothing falls below the threshold.
num_flagged = positive + false_positive
useful_effort = positive / num_flagged if num_flagged else float("nan")
print("Useful effort:", useful_effort)

# Accuracy of the averaged predictions against the training labels.
acc = float(is_correct.mean()) if is_correct.size else float("nan")
print("Overall accuracy:", acc)