## An example of BERT
References:
1. https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
2. https://www.kdnuggets.com/2020/02/intent-recognition-bert-keras-tensorflow.html

In [0]:
## Install bert-for-tf2 (BERT for TensorFlow 2)
!pip install bert-for-tf2

In [0]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
import os
import re
from tqdm import tqdm

In [0]:
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

### Load data

In [0]:
# Mount Google Drive at /content/drive so the data files below can be read
# (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Read the train and test CSV files from the data folder on the mounted Drive.
folder_path = "/content/drive/My Drive/Colab Notebooks/data/"
train, test = (
  pd.read_csv(os.path.join(folder_path, csv_name))
  for csv_name in ("train_set.csv", "test_set.csv")
)

### Load bert model

In [0]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

In [0]:
# Paths to the config, checkpoint and vocabulary inside the unzipped BERT folder.
bert_folder = "uncased_L-12_H-768_A-12/"
bert_config_file, bert_ckpt_file, bert_vocab_file = (
  os.path.join(bert_folder, file_name)
  for file_name in ("bert_config.json", "bert_model.ckpt", "vocab.txt")
)

### Preprocessing

In [0]:
# Build the BERT tokenizer from the vocab.txt shipped with the downloaded checkpoint.
tokenizer = FullTokenizer(vocab_file=bert_vocab_file)

In [0]:
# Demo: split a sample sentence into tokens (the notebook displays the result).
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

In [0]:
# Demo: tokenize the sample sentence, then map the tokens to vocabulary ids.
tokens = tokenizer.tokenize("This here's an example of using the BERT tokenizer")
tokenizer.convert_tokens_to_ids(tokens)

In [0]:
# Fixed length of every model input: _pad brings each id sequence to this
# length and create_model uses it as the input shape.
max_len = 64

In [0]:
# Map each label string to the integer class id used as the training target;
# the id is the label's position in the tuple below.
label_dict = {
  label: class_id
  for class_id, label in enumerate(("company", "tasks", "profile", "benefits"))
}

In [0]:
def _convert_single(input_text):
  """Turn one text into a list of BERT vocabulary ids.

  The text is tokenized with the module-level `tokenizer`, wrapped in the
  "[CLS]" and "[SEP]" markers, and mapped to ids. No truncation or padding
  happens here.
  """
  pieces = tokenizer.tokenize(input_text)
  return tokenizer.convert_tokens_to_ids(["[CLS]", *pieces, "[SEP]"])

def _convert_multiple(input_list):
  """Convert every text in `input_list` to its list of token ids.

  Args:
    input_list: iterable of texts; each one is passed to `_convert_single`.

  Returns:
    A list with one (unpadded) token-id list per input text, in input order.
    Progress is reported through `tqdm`.
  """
  return [_convert_single(sent) for sent in tqdm(input_list)]

def _pad(token_ids_list):
  x_padded = []
  for input_ids in token_ids_list:
    input_ids = input_ids[:min(len(input_ids), max_len - 2)]
    input_ids = input_ids + [0] * (max_len - len(input_ids))
    x_padded.append(np.array(input_ids))
  return np.array(x_padded)

def convert(input_list):
  """Tokenize the texts in `input_list` and return them as one padded id array.

  Chains `_convert_multiple` (text -> token ids) with `_pad` (fixed length).
  """
  return _pad(_convert_multiple(input_list))

In [0]:
# Turn the `sentence` column of each split into a padded token-id array.
X_train = convert(train.sentence)
X_test = convert(test.sentence)

In [0]:
# Translate the `label` column to integer class ids via label_dict.
# NOTE(review): labels missing from label_dict would map to NaN — confirm the
# data only contains the four known labels.
y_train = train.label.map(label_dict).values
y_test = test.label.map(label_dict).values

In [0]:
# Class ids, derived from label_dict so the two cannot drift apart;
# len(classes) sets the size of the model's output layer.
classes = np.array(sorted(label_dict.values()))

### Build model

In [0]:
def create_model():
  """Build the BERT-based classifier and load the pretrained BERT weights.

  Reads the BERT configuration from `bert_config_file`, puts a small
  classification head on top of the position-0 output of the BERT layer
  (where `_convert_single` places "[CLS]"), and restores the stock
  checkpoint from `bert_ckpt_file` into the BERT layer.

  Returns:
    A `tf.keras.Model` that takes int32 ids of shape (batch, max_len) and,
    because its last layer uses a softmax activation, outputs a probability
    for each of the `len(classes)` classes (not raw logits).
  """
  with tf.io.gfile.GFile(bert_config_file, "r") as config_reader:
    config_json = config_reader.read()

  stock_config = StockBertConfig.from_json_string(config_json)
  bert_params = map_stock_config_to_params(stock_config)
  bert_params.adapter_size = None  # build the layer without adapters
  bert = BertModelLayer.from_params(bert_params, name="bert")

  input_ids = tf.keras.layers.Input(shape=(max_len, ), dtype='int32', name="input_ids")
  bert_output = bert(input_ids)

  print("bert shape", bert_output.shape)

  # Classification head: position-0 vector -> dropout -> tanh dense -> dropout -> softmax.
  n_classes = len(classes)
  pooled = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
  pooled = tf.keras.layers.Dropout(0.5)(pooled)
  hidden = tf.keras.layers.Dense(units=768, activation="tanh")(pooled)
  hidden = tf.keras.layers.Dropout(0.5)(hidden)
  class_probs = tf.keras.layers.Dense(units=n_classes, activation="softmax")(hidden)

  model = tf.keras.Model(inputs=input_ids, outputs=class_probs)
  model.build(input_shape=(None, max_len))

  # Load the pretrained weights only after the model has been built.
  load_stock_weights(bert, bert_ckpt_file)

  return model

In [0]:
# Build the classifier; create_model also loads the pretrained BERT weights.
model = create_model()

In [0]:
# Print the layer-by-layer summary of the model.
model.summary()

In [0]:
# The model's final Dense layer already applies softmax, so the loss must
# treat its output as probabilities (from_logits=False); from_logits=True
# would apply softmax a second time.
model.compile(
  optimizer=tf.keras.optimizers.Adam(1e-5),
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [0]:
# One TensorBoard log directory per run, named by start time.
# %S is seconds; lowercase %s is not a standard strftime directive and its
# output is platform dependent.
log_dir = "log/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

# Fine-tune for 5 epochs, holding out 20% of the training data for validation.
history = model.fit(
  x=X_train,
  y=y_train,
  validation_split=0.2,
  batch_size=64,
  shuffle=True,
  epochs=5,
  callbacks=[tensorboard_callback]
)