# __Test another way to finetune BERT__

Based on [Classify text with BERT](https://www.tensorflow.org/text/tutorials/classify_text_with_bert) from TensorflowHub.

## __Setup__

### _Install_

For tensorflow
```bash
  pip install -U "tensorflow-text==2.8.*"
  #pip install tf-models-official==2.7.0
  # Ran into a tensorflow_model "module not found" error with the pinned version.
  # Try again without specifying the version
  pip install tf-models-official
````

For running in vscode:
```bash
  conda install -n bert_finetune ipykernel --update-deps --force-reinstall
````

### _Import_

In [1]:
import os
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

from sklearn import model_selection

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_models as tfm
from official.nlp import optimization

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

  from .autonotebook import tqdm as notebook_tqdm


### _Configuration info_

In [3]:
# --- Reproducibility ---------------------------------------------------------
# Fixed random seed handed to the train/validation/test split
seed = 20220609

# --- Paths -------------------------------------------------------------------
work_dir = Path.home() / "projects/plant_sci_hist/2_text_classify"
corpus_combo_file = work_dir / "corpus_combo"

# --- Dataset -----------------------------------------------------------------
batch_size = 32
shuffle_buffer = 2

# AUTOTUNE lets tf.data pick buffer sizes itself: it builds a performance model
# of the input pipeline and optimizes how its CPU budget is spread over every
# parameter marked AUTOTUNE. See
# https://stackoverflow.com/questions/56613155/tensorflow-tf-data-autotune
AUTOTUNE = tf.data.AUTOTUNE

# --- Model input -------------------------------------------------------------
# Upper bound on the number of tokens kept per document
max_length = 512


## __Get text ready__

### _Read json to dataframe_

In [4]:
def split_train_validate_test(corpus_combo_file, rand_state):
  '''Load the cleaned corpus and split it into train/validation/test subsets.

  The corpus is first split 80/20 into (train+validation)/test, then the 80%
  part is split 75/25 into train/validation, i.e. 60/20/20 overall. Both
  splits are stratified on the 'label' column.

  Args:
    corpus_combo_file (str or pathlib.Path): path to the json data file
    rand_state (int): random state passed to both splits, for reproducibility
  Return:
    train, valid, test (pandas dataframes): training, validation, testing
      sets, each holding only the 'label' and 'txt' columns
  '''
  # Read-only access is all that is needed; "r+" would also demand write
  # permission on the file. Path() lets callers pass a plain string as well.
  with Path(corpus_combo_file).open("r") as f:
      corpus_combo_json = json.load(f)

  # Convert json back to dataframe
  # NOTE(review): this presumes the file stores a JSON-encoded string that
  # itself holds the dataframe's JSON -- confirm against the code that wrote it.
  corpus_combo = pd.read_json(corpus_combo_json)

  # Keep only the label and the cleaned text
  corpus = corpus_combo[['label','txt']]

  # Hold out 20% of the corpus as the test set
  train, test = model_selection.train_test_split(corpus, 
      test_size=0.2, stratify=corpus['label'], random_state=rand_state)

  # Hold out 25% of the remainder (20% of the corpus) as the validation set
  train, valid = model_selection.train_test_split(train, 
      test_size=0.25, stratify=train['label'], random_state=rand_state)

  return train, valid, test

In [5]:
# Split the corpus with the fixed seed so the three subsets are reproducible
train, valid, test = split_train_validate_test(corpus_combo_file, seed)

### _Save text entries into files_

Follow the same structure as the IMDB dataset in the `aclImdb` folder.

In [6]:
# Root folder that will hold one sub-folder per data split
corpus_dir = work_dir / "corpus_dir"

# Paths of the train, valid and test folders
train_dir, valid_dir, test_dir = (corpus_dir / split_name
                                  for split_name in ('train', 'valid', 'test'))

# Create them (parents included); nothing happens if they already exist
for split_dir in (train_dir, valid_dir, test_dir):
  split_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Every split folder gets a "pos" and a "neg" sub-folder, one per class
for split_dir in (train_dir, valid_dir, test_dir):
  for class_folder in ("pos", "neg"):
    (split_dir / class_folder).mkdir(parents=True, exist_ok=True)

In [8]:
def save_entry_to_file(df, target_dir):
  '''Save each text entry in the dataframe as its own UTF-8 text file.

  Entries whose label is 0 are written to `target_dir/neg`, all other entries
  to `target_dir/pos`. Each file is named after the row's position in `df`
  (`<position>.txt`). The `neg` and `pos` folders must already exist. The
  number of entries written per class is printed at the end.

  Args:
    df (dataframe): must provide a 'label' and a 'txt' column
    target_dir (pathlib.Path): folder containing the `neg` and `pos` folders
  '''

  labels = df['label'].values
  txts   = df['txt'].values
  c_dict = {0:0, 1:0}
  for count, (label, txt) in tqdm(enumerate(zip(labels, txts)),
                                  total=len(labels)):
    # Label 0 is the negative class; everything else is treated as positive
    if label == 0:
      class_key, class_folder = 0, "neg"
    else:
      class_key, class_folder = 1, "pos"
    c_dict[class_key] += 1

    # Fix the encoding explicitly: the platform default may not be UTF-8,
    # which would make the files depend on the machine that wrote them.
    out_file = target_dir / class_folder / f"{count}.txt"
    with open(out_file, "w", encoding="utf-8") as f:
      f.write(txt)

  print(c_dict)

In [None]:
# Write every split into its own folder, one text file per entry
for split_df, split_dir in ((train, train_dir),
                            (valid, valid_dir),
                            (test, test_dir)):
  save_entry_to_file(split_df, split_dir)

### _Create datasets_

In [None]:
# Build the batched training dataset from the text files under train_dir
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    str(train_dir),
    batch_size=batch_size,
)
type(raw_train_ds)

In [None]:
# Class names as reported by the raw training dataset
class_names = raw_train_ds.class_names

# Cache the batches, then let tf.data tune the prefetch buffer
train_ds = raw_train_ds.cache()
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

class_names, type(train_ds)

In [None]:
# Validation set: read from valid_dir, then cache and prefetch
raw_valid_ds = tf.keras.utils.text_dataset_from_directory(
    str(valid_dir),
    batch_size=batch_size,
)
valid_ds = raw_valid_ds.cache()
valid_ds = valid_ds.prefetch(buffer_size=AUTOTUNE)

# Test set: read from test_dir, then cache and prefetch
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    str(test_dir),
    batch_size=batch_size,
)
test_ds = raw_test_ds.cache()
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)

In [None]:
# Show the first record of the first training batch
for text_batch, label_batch in train_ds.take(1):
  first_text = text_batch.numpy()[0]
  print(f'Text: {first_text}')
  label = label_batch.numpy()[0]
  print(f'Label : {label} ({class_names[label]})')

## __Define Hub models and initial testing__

### _Hub models to use_

Use BERT trained on MEDLINE/Pubmed:
- https://tfhub.dev/google/experts/bert/pubmed/2

In [None]:
# TF-Hub handle of the BERT encoder trained on MEDLINE/PubMed
tfhub_encoder = 'https://tfhub.dev/google/experts/bert/pubmed/2'
# TF-Hub handle of the preprocessing model (tokenizer + input packer)
# NOTE(review): presumably the preprocessor this encoder expects -- confirm on
# the encoder's hub page.
tfhub_preproc = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

### _Load and test preprocessing model_

In [None]:
# Wrap the preprocessing model as a single Keras layer and show its type
bert_preprocess_model = hub.KerasLayer(tfhub_preproc)
type(bert_preprocess_model)

In [None]:
# Load the same handle as a plain SavedModel (not a Keras layer); this exposes
# the `tokenize` and `bert_pack_inputs` members used in the cells below.
bert_preprocess = hub.load(tfhub_preproc)
type(bert_preprocess)

In [None]:
text_test = ['This paper is about Plant, like maize, rice, and tomato!']

#######################
# CRITICAL STEP: tokenize and pack by hand so that the packed sequence length
# is max_length (512) instead of the 128 obtained from the one-shot
# preprocessing layer (commented out below).
#######################

tok = bert_preprocess.tokenize(tf.constant(text_test))
# NOTE(review): the same tokens are passed as both segments -- confirm that
# duplicating the text is intended.
text_preprocessed = bert_preprocess.bert_pack_inputs([tok, tok], 
                                                     tf.constant(max_length))
#text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')

# The last dimension of the shape is the packed sequence length.
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
# First 30 positions of each packed input for the first (only) example
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :30]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :30]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :30]}')

### _Load and test BERT model_

In [None]:
# Load the BERT encoder as a Keras layer and show its type
bert_model = hub.KerasLayer(tfhub_encoder)
type(bert_model)

In [None]:
# Run the encoder on the packed test sentence
bert_results = bert_model(text_preprocessed)
print(f'Loaded BERT: {tfhub_encoder}')

In [None]:
# pooled_output: one embedding per document
# 768: size of the embedding vector
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
# First 7 values of the first document's embedding
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :7]}')

In [None]:
# sequence_output: one embedding per token
# 512: number of token positions in text_preprocessed
# 768: size of the embedding vector
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
# Embeddings of the first two token positions of the first document
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :2]}')

In [None]:
# encoder_outputs: intermediate activations, one tensor per transformer block
# Q: Assuming activation is the output value of the activation function.
# 12: number of transformer blocks
print(f'Encoder Outputs length:{len(bert_results["encoder_outputs"])}')

# The output of the LAST block is the one expected to match sequence_output,
# so index [-1] (index [0] would show the first block instead).
last_block_output = bert_results["encoder_outputs"][-1]
print(f'Last Encoder Output Shape:{last_block_output.shape}')
print(f'Last Encoder Output Values:{last_block_output[0, :2]}')

## __Build classification model__

6/18/19
- The challenge is how to use bert_pack_inputs as a preprocessing layer.
- [This hub page](https://www.tensorflow.org/hub/common_saved_model_apis/text) has some info: does not help much, as the info is for individual instance and the model is very different from what I want.
- [Fine-tuning a BERT model](https://www.tensorflow.org/text/tutorials/fine_tune_bert) tutorial: Here the dataset is tokenized, packed, and then used as input to the model. Try it and see.
- Also check the [preprocess model page](https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3) for syntax.

### _Get tokenizer to work_

In [None]:
# Original example
#tok = bert_preprocess.tokenize(tf.constant(text_test))
#text_preprocessed = bert_preprocess.bert_pack_inputs([tok, tok], 
#                                                     tf.constant(max_length))

In [None]:
# Wrap only the tokenize step of the preprocessing SavedModel as a Keras layer
tokenizer  = hub.KerasLayer(bert_preprocess.tokenize)

In [None]:
# Tokenize two one-sentence batches and display the resulting tokens
sentences1 = tf.constant(["hello tensorflow"])
sentences2 = tf.constant(["goodbye tensorflow"])
tokens1    = tokenizer(sentences1)
tokens2    = tokenizer(sentences2)
tokens1, tokens2

In [None]:
# Asking the KerasLayer wrapper for the special tokens throws:
#   AttributeError: 'KerasLayer' object has no attribute 'get_special_tokens_dict'
# Not sure how important this is...
#special = tokenizer.get_special_tokens_dict

# The "Custom input packing and MLM support" section of
# https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
# calls the method on the SavedModel's tokenize member instead:
special = bert_preprocess.tokenize.get_special_tokens_dict()
special

### _Get Packer to work_

In [None]:
# Packing layer that produces inputs of length max_length, using the special
# tokens reported by the tokenizer
packer = tfm.nlp.layers.BertPackInputs(
    seq_length=max_length,
    special_tokens_dict = special)

In [None]:
# Pack the two tokenized sentences as a two-segment input, then print each
# packed tensor's shape and its first 10 positions
packed = packer([tokens1, tokens2])
for key, tensor in packed.items():
  print(f"{key:15s}: {tensor.shape}, {tensor[:, :10]}")

### _Combine tokenizer and packer into a layer_

In [None]:
class BertInputProcessor(tf.keras.layers.Layer):
  '''Keras layer that tokenizes raw text and packs it into BERT inputs.

  Args:
    tokenizer: callable turning a batch of strings into tokens
    packer: callable turning a list of token segments into packed inputs
  '''

  def __init__(self, tokenizer, packer):
    super().__init__()
    self.tokenizer = tokenizer
    self.packer = packer

  def call(self, inputs):
    '''Tokenize and pack `inputs['txt']`.

    Args:
      inputs (dict): must hold a 'txt' entry; may hold a 'label' entry
    Return:
      (packed, label) when 'label' is present, otherwise packed alone
    '''
    # The tutorial tokenizes two features ('sentence1', 'sentence2'); only
    # one text feature exists here, so it is used for both segments.
    # NOTE(review): confirm that duplicating the text as the second segment
    # is intended.
    tokens = self.tokenizer(inputs['txt'])
    bert_inputs = self.packer([tokens, tokens])

    if 'label' not in inputs:
      return bert_inputs
    return bert_inputs, inputs['label']

In [None]:
# Combine the tokenizer and the packer defined above into one layer
bert_inputs_processor = BertInputProcessor(tokenizer, packer)

In [None]:
# Display the training dataset (and its element structure)
train_ds

In [None]:
# train_ds yields (text, label) tuples, while BertInputProcessor reads its
# input by the keys 'txt' and 'label', so wrap each batch in a dict first.
# The result of map() is a dataset of (packed inputs, label) pairs; it is kept
# as one dataset instead of being unpacked into two names.
train_packed = train_ds.map(
    lambda txt, label: bert_inputs_processor({'txt': txt, 'label': label})
).prefetch(1)

In [None]:
def build_classifier_model():
  '''Assemble the raw-text -> BERT -> 2-unit softmax Keras model.

  Reads the module-level `bert_preprocess`, `max_length` and `tfhub_encoder`.
  Prints the type of the encoder output as a side effect.

  Return:
    tf.keras.Model taking a batch of strings (input named 'txt') and
    returning the output of the 2-unit softmax layer named 'classifier'
  '''
  # Raw text enters as a batch of scalar strings
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='txt')

  # Tokenize with the tokenize step of the preprocessing SavedModel
  tokenize_layer = hub.KerasLayer(bert_preprocess.tokenize)
  special_tokens = bert_preprocess.tokenize.get_special_tokens_dict()
  token_ids = tokenize_layer(raw_text)

  # Pack to max_length tokens: this is the key change allowing longer texts
  pack_layer = tfm.nlp.layers.BertPackInputs(
      seq_length=max_length,
      special_tokens_dict=special_tokens)
  bert_inputs = pack_layer(token_ids)

  # Trainable encoder, so fine-tuning updates the BERT weights as well
  bert_encoder = hub.KerasLayer(tfhub_encoder, trainable=True,
                                name='BERT_encoder')
  bert_outputs = bert_encoder(bert_inputs)

  # Q: Wonder if this is the dense layer mentioned above.
  print(type(bert_outputs))

  # Keep only the document-level embedding (token-level info is ignored)
  doc_embedding = bert_outputs['pooled_output']
  doc_embedding = tf.keras.layers.Dropout(0.1)(doc_embedding)

  # Output layer: two softmax units
  class_probs = tf.keras.layers.Dense(2, activation='softmax',
                                      name='classifier')(doc_embedding)
  return tf.keras.Model(raw_text, class_probs)

In [None]:
# Build the (not yet fine-tuned) classifier
classifier_model = build_classifier_model()

text_test        = ['Plant science focuses on studies of photosynthetic species.']

# tf.constant: create a Tensor from tensor like objects
tensor_test      = tf.constant(text_test)
# Output of the untrained classifier head for the test sentence
bert_raw_result  = classifier_model(tensor_test)

print("Raw result   :", bert_raw_result)

In [None]:
# Print the layers and parameter counts of the classifier
classifier_model.summary()

## __Model training__

### _Compile model_

In [None]:
# Training schedule: warm up over the first 10% of all training steps
epochs           = 20
cardinality      = tf.data.experimental.cardinality(train_ds)
steps_per_epoch  = cardinality.numpy()   # number of batches in train_ds
num_train_steps  = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

# loss function: integer class labels against the 2-unit softmax output
loss    = tf.keras.losses.SparseCategoricalCrossentropy()

# evaluation metrics: the sparse categorical accuracy matches that same
# label/output layout; BinaryAccuracy would threshold both softmax columns and
# compare each with the integer label. The metric keeps the name
# 'binary_accuracy' so that the history keys read further down
# ('binary_accuracy', 'val_binary_accuracy') stay valid.
metrics = tf.metrics.SparseCategoricalAccuracy(name='binary_accuracy')

# Initial learning rate
init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
# Attach the optimizer, loss and metric configured above
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

### _Train model_

In [None]:
# Specify callbacks

# early stopping: stop once val_loss has not improved for 5 epochs
callback_es  = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# model checkpoint: keep only the best full model. The compiled metric is
# logged as 'binary_accuracy', so its validation value is
# 'val_binary_accuracy'; monitoring the non-existent 'val_accuracy' would
# never save a checkpoint.
cp_filepath  = work_dir / "model_ori_bert_tf_pubmed"
callback_mcp = tf.keras.callbacks.ModelCheckpoint(filepath=str(cp_filepath), 
              mode='max', save_weights_only=False,
              monitor='val_binary_accuracy', 
              save_best_only=True)

In [None]:
print(f'Training model with {tfhub_encoder}')
# train_ds is an already-batched tf.data dataset, so no batch_size is passed:
# Keras rejects that argument for dataset inputs.
history = classifier_model.fit(x=train_ds, 
                               epochs=epochs,
                               validation_data=valid_ds, 
                               verbose=1,
                               callbacks=[callback_es, callback_mcp])

### Evaluate the model

Let's see how the model performs. Two values will be returned. Loss (a number which represents the error, lower values are better), and accuracy.

In [None]:
# Evaluate on the held-out test set.
# NOTE: this rebinds `loss`, which until here held the loss object given to
# compile(); re-run the compile configuration cell before compiling again.
loss, accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

### Plot the accuracy and loss over time

Based on the `History` object returned by `model.fit()`. You can plot the training and validation loss for comparison, as well as the training and validation accuracy:

In [None]:
# Per-epoch values recorded by fit()
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
# Own names for the curves, so the `loss` object and the `epochs` count used
# to configure training are not overwritten by re-running this cell.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epoch_axis = range(1, len(acc) + 1)
fig = plt.figure(figsize=(10, 6))

# Top panel: loss
plt.subplot(2, 1, 1)
# r is for "solid red line"
plt.plot(epoch_axis, train_loss, 'r', label='Training loss')
# b is for "solid blue line"
plt.plot(epoch_axis, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
# plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Bottom panel: accuracy
plt.subplot(2, 1, 2)
plt.plot(epoch_axis, acc, 'r', label='Training acc')
plt.plot(epoch_axis, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

# tight_layout only arranges axes that already exist, so it has to run after
# both subplots have been drawn.
fig.tight_layout()

In this plot, the red lines represent the training loss and accuracy, and the blue lines are the validation loss and accuracy.

## Export for inference

Now you just save your fine-tuned model for later use.

In [None]:
# NOTE(review): 'imdb' looks like a leftover from the tutorial -- confirm the
# intended name. The path is relative to the current working directory.
dataset_name = 'imdb'
safe_name = dataset_name.replace('/', '_')
saved_model_path = './{}_bert'.format(safe_name)

# Save the fine-tuned model without its optimizer state
classifier_model.save(
    saved_model_path,
    include_optimizer=False,
)

Let's reload the model, so you can try it side by side with the model that is still in memory.

In [None]:
# Load the exported SavedModel back from disk
reloaded_model = tf.saved_model.load(saved_model_path)

Here you can test your model on any sentence you want, just add to the examples variable below.

In [None]:
def print_my_examples(inputs, results):
  '''Print one "input / score" line per example, then an empty line.

  Args:
    inputs (sequence of str): the example sentences
    results (sequence): `results[i][0]` is printed as the score of
      `inputs[i]`, with 6 decimals
  '''
  report_lines = []
  for position, sentence in enumerate(inputs):
    first_score = results[position][0]
    report_lines.append(
        f'input: {sentence:<30} : score: {first_score:.6f}')

  print(*report_lines, sep='\n')
  print()


# NOTE(review): these sentences come from the movie-review tutorial; replace
# them with texts from the actual corpus domain.
examples = [
    'this is such an amazing movie!',  # this is the same sentence tried earlier
    'The movie was great!',
    'The movie was meh.',
    'The movie was okish.',
    'The movie was terrible...'
]

# The classifier ends in a softmax layer, so its outputs already are
# probabilities; passing them through a sigmoid again would distort them.
# print_my_examples shows the first column, i.e. class index 0.
reloaded_results = reloaded_model(tf.constant(examples))
original_results = classifier_model(tf.constant(examples))

print('Results from the saved model:')
print_my_examples(examples, reloaded_results)
print('Results from the model in memory:')
print_my_examples(examples, original_results)

If you want to use your model on [TF Serving](https://www.tensorflow.org/tfx/guide/serving), remember that it will call your SavedModel through one of its named signatures. In Python, you can test them as follows:

In [None]:
# Call the model through its named serving signature, as TF Serving would
serving_results = reloaded_model \
            .signatures['serving_default'](tf.constant(examples))

# The signature returns a dict keyed by output layer name. The 'classifier'
# layer is a softmax, so its values are used as they are (no extra sigmoid).
serving_results = serving_results['classifier']

print_my_examples(examples, serving_results)

# __FAILED STEPS__


## _[Gave up on this] Convert training dataframe to dataset_

I tried many different ways, hoping to avoid splitting the text records into separate files and then loading them as a dataset like the tutorial does. But converting from a dataframe always missed something that I cannot quite put my finger on. Moving on. Revisit this later...

- See the [pd_dataframe_to_tf_dataset](https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/pd_dataframe_to_tf_dataset) function, but this needs tf 2.9, conflict with tensorflow_text.
- See [this](https://www.tensorflow.org/tutorials/load_data/pandas_dataframe): See the shuffle and batch functions. Does not work...
- See [this post](https://medium.com/when-i-work-data/converting-a-pandas-dataframe-into-a-tensorflow-dataset-752f3783c168): I was able to create a TensorSliceDataset, then a BatchDataset after applying the batch function, then a PrefetchDataset. But trying to retrieve a test example from the training dataset led to:
  - InvalidArgumentError: Index out of range using input dim 0; input has only 0 dims [Op:StridedSlice] name: strided_slice/
  - Ok, as I was implementing the next solution, I realized that I did not call prefetch on the right object. That may be the reason why.
- Ah, see [this post](https://stackoverflow.com/questions/58461609/how-to-convert-pandas-dataframe-to-tensorflow-dataset): key is to turn train_data to dictionary before calling from_tensor_slices.
  - A short comment below the answer says to use .to_dict() instead, which makes sense: with just dict(train), the conversion finishes in 0.1 sec, which seems too fast to be right. But this fails and throws:
    - ValueError: Unbatching a tensor is only supported for rank >= 1
  - Found [this post](https://stackoverflow.com/questions/55560620/valueerror-unbatching-a-tensor-is-only-supported-for-rank-1): Now try to uses this syntax. Still does not work...

#### The following DOES NOT work

```Python
raw_train_ds = (tf.data.Dataset.from_tensor_slices(
        (tf.cast(train['txt'].values, tf.string),
         tf.cast(train['label'].values, tf.int32),)))
raw_train_ds = tf.data.Dataset.from_tensor_slices(train)
raw_train_ds = tf.data.Dataset.from_tensor_slices(dict(train))
raw_train_ds = tf.data.Dataset.from_tensor_slices(train.to_dict())
raw_train_ds = tf.data.Dataset.from_tensor_slices((y_train,X_train))
raw_train_ds = tf.data.Dataset.from_tensor_slices((i_train, 
                                                   y_train.values,
                                                   X_train.values))
```

The last one is almost working, but at later stage when I try to pack the input, it breaks. Upon closer examination, the train_ds looks like:

```
<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>
```

But what I need is more like:

```
<PrefetchDataset element_spec={'idx': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'label': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'sentence1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'sentence2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}>
```

In [None]:
# Try the tuple-based conversion on the first three training rows only
trial = train.iloc[:3,:]
raw_trial_ds = (tf.data.Dataset.from_tensor_slices(
        (tf.cast(trial['txt'].values, tf.string),
         tf.cast(trial['label'].values, tf.int32),)))
raw_trial_ds
# Does not work

In [None]:
# Named "trial" so it does not get mixed up with the testing set.
# Hand-made two-record example with the keys idx, label and txt.
trial_dict   = {"idx":[123,322], "label":[0,1], 
               "txt":["The 1st sentence", "The second"]}

# This creates a TensorSliceDataset whose elements are dicts
raw_trial_ds =  tf.data.Dataset.from_tensor_slices(trial_dict)

# Now this has the right structure!!!
raw_trial_ds

In [None]:
# Each element of raw_trial_ds is a dict (keys: idx, label, txt) holding ONE
# unbatched record, so the fields are read by key and used without indexing.
# Unpacking the element into three names would only yield the key strings.
for record in raw_trial_ds.take(1):
  label = record['label'].numpy()
  print(f'Label : {label} ({class_names[label]})')
  print(f'Review: {record["txt"].numpy()}')


In [None]:
# Group the trial records into batches of batch_size and show the type
raw_trial_ds_batch = raw_trial_ds.batch(batch_size)
type(raw_trial_ds_batch)

In [None]:
# Cache and prefetch the batched trial data, then show the number of batches
trial_ds = raw_trial_ds_batch.cache().prefetch(buffer_size=AUTOTUNE)
len(trial_ds)

In [None]:
# Still has the right structure: the elements are dicts with a 'txt' entry.
trial_ds.element_spec['txt']

In [None]:
# Take the first batch and look up the documentation of the returned object
take1 = trial_ds.take(1)
help(take1)
    

#### Convert train, valid, and test sets

In [None]:
def dataframe_to_dataset(df):
  '''Turn a DataFrame into a batched, cached and prefetched tf.data dataset.

  Reads the module-level `batch_size` and `AUTOTUNE`.

  Args:
    df (dataframe): with two columns ('txt', and 'label') and indices
  Return:
    dataset: its elements are dicts with the keys
      'idx'   -- the DataFrame index (added because the tutorial's data
                 carry an index as well),
      'label' -- values of the 'label' column,
      'txt'   -- values of the 'txt' column.
      NOTE(review): the tensor dtypes presumably follow the DataFrame's own
      dtypes -- confirm before relying on int32 vs int64.
  '''
  features = {
      "idx": df.index,
      "label": df['label'].values,
      "txt": df['txt'].values,
  }

  # slice into records -> batch -> cache -> prefetch
  return (tf.data.Dataset.from_tensor_slices(features)
          .batch(batch_size)
          .cache()
          .prefetch(buffer_size=AUTOTUNE))

In [None]:
# Convert the training dataframe; note that this replaces the train_ds that
# was built from the text files earlier in the notebook
train_ds = dataframe_to_dataset(train)
train_ds

In [None]:
# Get validation dataset (replaces the valid_ds built from the text files)
valid_ds = dataframe_to_dataset(valid)

# Get testing dataset (replaces the test_ds built from the text files)
test_ds = dataframe_to_dataset(test)

### _Testing_

In [None]:
# Dataset restricted to the first training batch
train_ds.take(1)

In [None]:
# Batches produced by dataframe_to_dataset are dicts (keys: idx, label, txt),
# so the text and label tensors are read by key instead of being unpacked
# like a (text, label) tuple.
for batch in train_ds.take(1):
  text_batch, label_batch = batch['txt'], batch['label']
  print(text_batch[0])
  print(len(text_batch))
  print(label_batch)
  print(len(label_batch))

In [None]:
# Show the first record of the first batch. Batches produced by
# dataframe_to_dataset are dicts (keys: idx, label, txt), so the fields are
# read by key instead of being unpacked like a (text, label) tuple.
for batch in train_ds.take(1):
  text_batch, label_batch = batch['txt'], batch['label']
  print(f'Review: {text_batch.numpy()[0]}')
  label = label_batch.numpy()[0]
  print(f'Label : {label} ({class_names[label]})')