# BERT Clustering Test

In [1]:
# GPU selection.
# These variables are set in the first cell, before TensorFlow is imported in
# the next one, so the framework only ever sees the chosen device.
import os

os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # enumerate GPUs in PCI bus order
    "CUDA_VISIBLE_DEVICES": "1",        # expose only the GPU with index 1
})

In [2]:
import os
from pathlib import Path
import pickle
    
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# The log-level variable is deliberately set *before* the TensorFlow import
# below; it is meant to quieten TensorFlow's native (C++) log output.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow.compat.v1 as tf
# Restrict TensorFlow's Python-side logging to errors only.
tf.logging.set_verbosity(tf.logging.ERROR)
from tensorflow.compat.v1.keras import layers
import horovod.tensorflow as hvd
import horovod.tensorflow.keras as hvd_keras

import utils
import bert_utils
import bert_optimizer

In [3]:
# Notebook-wide switches. Change them here and re-run from the top.
# True selects the L-24 / H-1024 hub module, False the L-12 / H-768 one.
BERTLARGE = True
# Enables the auto_mixed_precision graph rewrite, the Keras mixed-precision
# policy and the dynamic loss-scale optimizer wrapper.
USE_AMP = True
# Sets the session's global JIT level to ON_1 (and enables resource variables).
USE_XLA = True
# Sequence length used when loading/tokenising the data and as the model input width.
MAX_SEQ_LEN = 128

In [4]:
# Pick the TF-Hub module URL together with its hidden size; the two values
# must always change as a pair, so they are selected in one expression.
BERT_PATH, H_SIZE = (
    ("https://tfhub.dev/google/bert_uncased_L-24_H-1024_A-16/1", 1024)
    if BERTLARGE
    else ("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1", 768)
)

Create TensorFlow session

In [5]:
# Session configuration: allocate GPU memory on demand rather than
# reserving the whole device up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

# XLA: resource variables are switched on only when the JIT is requested.
if USE_XLA:
    tf.enable_resource_variables()
opt_level = tf.OptimizerOptions.ON_1 if USE_XLA else tf.OptimizerOptions.OFF
config.graph_options.optimizer_options.global_jit_level = opt_level

# Automatic mixed precision graph rewrite follows the USE_AMP flag.
config.graph_options.rewrite_options.auto_mixed_precision = USE_AMP

# Create the session and make it the one Keras uses.
sess = tf.Session(config=config)
tf.keras.backend.set_session(sess)

Create Tokenizer

In [6]:
# Build the tokenizer from the same hub module the model will use, evaluating
# it in the shared session.
# NOTE(review): presumably this reads the vocabulary shipped with the hub
# module - confirm in bert_utils.
tokenizer = bert_utils.create_tokenizer_from_hub_module(BERT_PATH, sess)

Load dataset

In [7]:
# Load the training split of AG News (test=False). The helper returns the
# texts, their labels and the number of classes.
# NOTE(review): how max_seq_len is applied to the raw text (truncation by
# words vs. tokens) is decided inside utils.load_ag_news_dataset - confirm there.
train_text, train_label, num_classes = utils.load_ag_news_dataset(max_seq_len=MAX_SEQ_LEN,
                                                                  test=False)

Loaded training set from: /home/jovyan/.keras/datasets/ag_news
Examples: 120000 Classes: 4


In [8]:
# Tokenised training features are cached on disk so that the conversion of
# the full training set only has to run once.
train_feat_cache = "./cache2/train_feat.pickle"
train_feat = Path(train_feat_cache)

if train_feat.is_file():
    # NOTE: unpickling can execute arbitrary code - only load cache files that
    # this notebook wrote itself.
    # The context manager guarantees the handle is closed after reading.
    with train_feat.open("rb") as cache_file:
        feat = pickle.load(cache_file)
else:
    # Derive the directory from the cache path instead of repeating it.
    train_feat.parent.mkdir(parents=True, exist_ok=True)
    train_label = np.asarray(train_label)
    train_examples = bert_utils.convert_text_to_examples(train_text, train_label)
    feat = bert_utils.convert_examples_to_features(tokenizer,
                                                   train_examples,
                                                   max_seq_length=MAX_SEQ_LEN,
                                                   verbose=1)
    # Closing the file via `with` ensures the cache is fully flushed to disk.
    with train_feat.open("wb") as cache_file:
        pickle.dump(feat, cache_file)

(train_input_ids, train_input_masks, train_segment_ids, train_labels) = feat

# Shuffle all four arrays with the same permutation. A fixed seed makes the
# row order (and therefore everything derived from it) reproducible across
# kernel restarts.
train_input_ids, train_input_masks, train_segment_ids, train_labels = shuffle(train_input_ids,
                                                                              train_input_masks,
                                                                              train_segment_ids,
                                                                              train_labels,
                                                                              random_state=0)

In [9]:
# Tokenised test features are cached on disk; the raw test split is only
# loaded and converted when no cache file exists yet.
test_feat_cache = "./cache2/test_feat.pickle"
test_feat = Path(test_feat_cache)

if test_feat.is_file():
    # NOTE: unpickling can execute arbitrary code - only load cache files that
    # this notebook wrote itself.
    # The context manager guarantees the handle is closed after reading.
    with test_feat.open("rb") as cache_file:
        feat = pickle.load(cache_file)
else:
    # Derive the directory from the cache path instead of repeating it.
    test_feat.parent.mkdir(parents=True, exist_ok=True)
    examples, labels, num_classes = utils.load_ag_news_dataset(max_seq_len=MAX_SEQ_LEN,
                                                               test=True)
    labels = np.asarray(labels)
    test_examples = bert_utils.convert_text_to_examples(examples, labels)
    feat = bert_utils.convert_examples_to_features(tokenizer,
                                                   test_examples,
                                                   max_seq_length=MAX_SEQ_LEN,
                                                   verbose=1)
    # Closing the file via `with` ensures the cache is fully flushed to disk.
    with test_feat.open("wb") as cache_file:
        pickle.dump(feat, cache_file)

(test_input_ids, test_input_masks, test_segment_ids, test_labels) = feat

# Shuffle all four arrays with the same permutation. A fixed seed makes the
# row order reproducible across kernel restarts.
test_input_ids, test_input_masks, test_segment_ids, test_labels = shuffle(test_input_ids,
                                                                          test_input_masks,
                                                                          test_segment_ids,
                                                                          test_labels,
                                                                          random_state=0)

# Bundle inputs and labels in the (inputs, targets) layout Keras expects.
test_set = ([test_input_ids, test_input_masks, test_segment_ids], test_labels)

Create the model

In [19]:
# Mixed precision: the policy has to be set before any layer is created.
if USE_AMP:
    tf.keras.mixed_precision.experimental.set_policy('infer_float32_vars')

# The three BERT inputs share the same fixed sequence length.
in_bert = [
    layers.Input(shape=(MAX_SEQ_LEN,), name=input_name)
    for input_name in ("input_ids", "input_masks", "segment_ids")
]
in_id, in_mask, in_segment = in_bert

# Encoder returning the per-token sequence output.
l_bert = bert_utils.BERT(
    fine_tune_layers=-1,
    bert_path=BERT_PATH,
    return_sequence=True,
    output_size=H_SIZE,
    debug=False,
)(in_bert)

# pool_size=2 is the layer default, spelled out here: it averages adjacent
# token pairs and so only halves the sequence axis.
# NOTE(review): if one vector per example is wanted for clustering,
# GlobalAveragePooling1D may have been intended - confirm.
x = layers.AveragePooling1D(pool_size=2)(l_bert)

model = tf.keras.models.Model(inputs=in_bert, outputs=x)

In [20]:
# Plain Adam; under mixed precision it is wrapped so that the loss is scaled
# dynamically, otherwise it is used as-is.
opt = tf.keras.optimizers.Adam(lr=1e-5, decay=0.0)
opt = (
    tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")
    if USE_AMP
    else opt
)

In [21]:
# Compile the model with the optimizer built above.
# NOTE(review): the model's output is the pooled encoder output (no
# classification head) and the visible cells only call model.predict, so the
# loss and accuracy metric look unused here - confirm whether compile is needed.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])

In [22]:
# Print the layer table to check the architecture and output shapes.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
bert_1 (BERT)                   (None, None, 1024)   336224058   input_ids[0][0]                  
                                                                 input_masks[0][0]          

In [14]:
# Run the local-variable, global-variable and lookup-table initializers in the
# shared session before the model is used.
# NOTE(review): assumes the hub module's pretrained weights are restored as
# part of global variable initialisation rather than overwritten by it -
# confirm in bert_utils.BERT.
sess.run(tf.local_variables_initializer())
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())
# Register the session with Keras again (this was already done when the
# session was created).
tf.keras.backend.set_session(sess)

In [16]:
# Run the encoder over the whole (shuffled) training set to obtain the pooled
# outputs used as features.
# NOTE(review): the result holds one pooled sequence per example, which for
# the full training set may be very large in memory - confirm this fits.
results = model.predict([train_input_ids, train_input_masks, train_segment_ids],
                        verbose=1, batch_size=512)



In [17]:
# Collapse each example's pooled output into a single 1-D feature vector.
results_flat = [example_output.flatten() for example_output in results]

In [18]:
# Persist the flattened feature vectors. The context manager closes the file,
# which guarantees the pickle is completely flushed to disk.
# NOTE(review): the rows are in the shuffled order of the training arrays, but
# the matching train_labels are not stored with them - confirm the consumer of
# this file does not need them.
with open("./cache2/features.pickle", "wb") as features_file:
    pickle.dump(results_flat, features_file)