# __Step 2c: BERT models - scibert__

Follow:
- https://www.kaggle.com/code/gcspkmdr/scibert-wrapped-in-tf2/notebook
- https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb

## __Setup__

In [1]:
# Install pytorch for working with SciBERT
# %conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch

### _Imports_

In [2]:
'''
For building text classification model based on embedding of Word2Vec and BERT
'''

## for data
#import argparse
import os
import json
import pandas as pd
import numpy as np
import pickle
import sys
import itertools
from pathlib import Path

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import transformers
from datasets import Dataset

import tensorflow as tf


  from .autonotebook import tqdm as notebook_tqdm


### _Functions_

In [3]:
def read_configs(config_file):
  """Read configuration file and return a config_dict

  The file is parsed line by line as ``key=value`` pairs. Blank lines and
  lines starting with "#" are skipped, and keys that are not among the
  required settings are ignored. Every value is evaluated as a Python
  expression, so strings must be quoted in the file.

  Args:
    config_file (str or Path): path to the configuration file
  Return:
    config_dict (dict): required setting name -> evaluated value
  Raises:
    SystemExit: (exit code 1) if any required setting is absent
  """
  # A unique sentinel marks "not set yet", so that legitimate values such as
  # 0 or False are not mistaken for a missing setting.
  not_set = object()
  required = ('lang_model', 'proj_dir', 'work_dir', 'corpus_combo_file',
              'rand_state', 'bert_param')
  config_dict = dict.fromkeys(required, not_set)

  # Read config file and fill in the dictionary
  with open(config_file, 'r') as f:
    for line in f:
      line = line.strip()
      if not line or line.startswith("#"):
        continue
      # Split on the first "=" only, so values may themselves contain "="
      key, sep, value = line.partition("=")
      key = key.strip()
      if not sep or key not in config_dict:
        continue
      # NOTE(security): eval() executes arbitrary code from the config file.
      # Only use config files you trust; ast.literal_eval is a safer choice
      # if all values are plain literals.
      config_dict[key] = eval(value.strip())

  # Check if any config missing
  missing = 0
  for config in config_dict:
    if config_dict[config] is not_set:
      print("  missing:", config)
      missing += 1
    else:
      print("  ", config, "=", config_dict[config])

  if missing == 0:
    print("  all config available")
  else:
    print("  missing config, QUIT!")
    # Non-zero exit status: a missing setting is a failure, not a success
    sys.exit(1)

  return config_dict


def split_train_validate_test(corpus_combo_file, rand_state):
  '''Load data and split train, validation, test subsets for the cleaned texts

  The corpus is split 80/20 into train+validation and test, then the first
  part is split 75/25 into train and validation (60/20/20 overall). Both
  splits are stratified on the 'label' column.

  Args:
    corpus_combo_file (str or Path): path to the json data file
    rand_state (int): for reproducibility
  Return:
    [train, valid, test] (list of pandas dataframes): training, validation,
      testing sets, each with columns 'label', 'txt', 'txt_clean'
  '''
  from io import StringIO

  # Load json file. Read-only mode is enough; accept plain strings as well
  # as Path objects.
  with Path(corpus_combo_file).open("r") as f:
      corpus_combo_json = json.load(f)

  # Convert json back to dataframe. The loaded object is handed to
  # pd.read_json as JSON text; wrapping it in StringIO avoids passing a
  # literal JSON string, which newer pandas versions deprecate.
  corpus_combo = pd.read_json(StringIO(corpus_combo_json))

  corpus = corpus_combo[['label','txt','txt_clean']]

  # Split train test
  train, test = train_test_split(corpus,
      test_size=0.2, stratify=corpus['label'], random_state=rand_state)

  # Split train validate
  train, valid = train_test_split(train,
      test_size=0.25, stratify=train['label'], random_state=rand_state)

  print(f"    train={train.shape}, valid={valid.shape}," +\
        f" test={test.shape}")

  return [train, valid, test]
  


### _Get training/testing split_

In [4]:
# Parse the run configuration
config_file = Path("config_bert.txt")
print("\nRead configuration file...")
config_dict = read_configs(config_file)

# Seed used for the train/validation/test split
rand_state = config_dict['rand_state']

# Derive the working directory and corpus location from the project root
proj_dir          = Path(config_dict['proj_dir'])
work_dir          = proj_dir.joinpath(config_dict['work_dir'])
corpus_combo_file = work_dir.joinpath(config_dict['corpus_combo_file'])

# Relative paths used further down are resolved against the working directory
os.chdir(work_dir)


Read configuration file...
   lang_model = bert
   proj_dir = /home/shius/projects/plant_sci_hist
   work_dir = 2_text_classify
   corpus_combo_file = corpus_combo
   rand_state = 20220609
   bert_param = {}
  all config available


In [5]:
# Build the train/validation/test splits for the cleaned text.
#   The original text is not the focus here because of issues with
#   non-alphanumeric characters and stop words.
print("\nRead file and split train/validate/test...")
train, valid, test = split_train_validate_test(corpus_combo_file, rand_state)


Read file and split train/validate/test...
    train=(51987, 3), valid=(17329, 3), test=(17330, 3)


In [6]:
# Wrap each pandas split in a datasets.Dataset
dataset_train, dataset_valid, dataset_test = (
    Dataset.from_pandas(split_df) for split_df in (train, valid, test)
)

## __Set up SciBERT__

### _Get SciBERT model_

In [7]:
'''
!wget https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/tensorflow_models/scibert_scivocab_uncased.tar.gz
!tar -xvf ./scibert_scivocab_uncased.tar.gz
os.environ["WANDB_API_KEY"] = "0" ## to silence warning
!transformers-cli convert --model_type bert \
  --tf_checkpoint './scibert_scivocab_uncased/bert_model.ckpt' \
  --config './scibert_scivocab_uncased/bert_config.json' \
  --pytorch_dump_output './scibert_scivocab_uncased/pytorch_model.bin'
'''



In [8]:
# Path to the local SciBERT model directory
bert_model_name = './scibert_scivocab_uncased'

# Model parameters
BATCH_SIZE, TEST_BATCH_SIZE = 8, 8
NR_EPOCHS = 1
MAX_LEN   = 512  # try different lengths
threshold = 0.4

# Load the BERT configuration and switch off hidden-state outputs. Without
# this the loaded model also returns the hidden states, which caused an error
# later when the layers were put together.
config = transformers.BertConfig.from_json_file(
    './scibert_scivocab_uncased/bert_config.json')
config.output_hidden_states = False

In [9]:
# Load SciBERT as a TensorFlow model from the local model directory.
# from_pt=True asks transformers to read PyTorch weights; `config` is the
# BertConfig loaded above with output_hidden_states turned off.
scibert = transformers.TFBertModel.from_pretrained(bert_model_name, 
                        from_pt=True, config = config)

2022-06-17 14:52:27.616836: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-06-17 14:52:27.621103: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-06-17 14:52:27.621716: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-06-17 14:52:27.624055: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [10]:
# Print the layer and parameter summary of the loaded SciBERT model
scibert.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109918464 
                                                                 
Total params: 109,918,464
Trainable params: 109,918,464
Non-trainable params: 0
_________________________________________________________________


In [11]:
'''
# From:
# https://www.kaggle.com/code/gcspkmdr/scibert-wrapped-in-tf2/notebook
class BertClassifier(tf.keras.Model):        
  def __init__(self, bert: TFBertModel, num_classes: int):
      
      super().__init__()
      
      self.bert = bert
      
      self.classifier = Dense(num_classes, activation='sigmoid')
      
  def call(self, input_ids, attention_mask=None, token_type_ids=None, 
           position_ids=None, head_mask=None):
      
      outputs = self.bert(input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          position_ids=position_ids,
                          head_mask=head_mask)
      
      cls_output = outputs[1]
      
      cls_output = self.classifier(cls_output)
              
      return cls_output

model = transformers.BertClassifier(scibert, 2)
'''

"\n# From:\n# https://www.kaggle.com/code/gcspkmdr/scibert-wrapped-in-tf2/notebook\nclass BertClassifier(tf.keras.Model):        \n  def __init__(self, bert: TFBertModel, num_classes: int):\n      \n      super().__init__()\n      \n      self.bert = bert\n      \n      self.classifier = Dense(num_classes, activation='sigmoid')\n      \n  def call(self, input_ids, attention_mask=None, token_type_ids=None, \n           position_ids=None, head_mask=None):\n      \n      outputs = self.bert(input_ids,\n                          attention_mask=attention_mask,\n                          token_type_ids=token_type_ids,\n                          position_ids=position_ids,\n                          head_mask=head_mask)\n      \n      cls_output = outputs[1]\n      \n      cls_output = self.classifier(cls_output)\n              \n      return cls_output\n\nmodel = transformers.BertClassifier(scibert, 2)\n"

### _Get tokenizer_

In [12]:
# Load the tokenizer stored in the local SciBERT model directory, with
# lower-casing requested. This is the starting point for the retrained
# tokenizer below.
old_tokenizer = transformers.AutoTokenizer.from_pretrained(
                                          bert_model_name, do_lower_case=True,
                                          config=config)

### _Get the training texts_

In [13]:
# Create a folder for training-corpus files. Nothing is written to it in this
# cell; only the directory is created.
corpus_train_path = work_dir / "corpus_train"
corpus_train_path.mkdir(parents=True, exist_ok=True)

# Note: the original (uncleaned) text column is used for training the
# tokenizer. `txts` is the array of those texts for the training split.
txts  = train['txt'].values

### _Train tokenizer_

- https://huggingface.co/course/chapter6/2?fw=pt

In [14]:
# Parameters
# Target vocabulary size for the retrained tokenizer
vocab_size = 52000
# Maximum sequence length. It is not passed to the tokenizer training call
# below; it is used further down as the length of the Keras input layers.
max_length = 512
# Not used by the training call in this cell.
min_frequency=2

# Train a new tokenizer of the same type as old_tokenizer on the training
# texts.
# NOTE(review): the new vocabulary has 52000 entries, while the original
# tokenizer (and the SciBERT embedding table it matches) has 31090, see the
# next cell's output. Token ids from the retrained vocabulary therefore do not
# line up with the pretrained embedding rows, and ids >= 31090 have no row at
# all. Confirm that encoding with the retrained tokenizer is intended.
tokenizer = old_tokenizer.train_new_from_iterator(txts, vocab_size)






In [15]:
# Compare vocabulary sizes of the original and the retrained tokenizer
len(old_tokenizer), len(tokenizer)

(31090, 52000)

In [16]:
# Save tokenizer and reload
# Note: the same directory is later used as the ModelCheckpoint filepath, so
# tokenizer files and the saved model end up side by side.
model_path = work_dir / "model_cln_bert_scibert"
model_path.mkdir(parents=True, exist_ok=True)

# Save the tokenizer files into model_path
tokenizer.save_pretrained(str(model_path))

# This step is critical: the trained tokenizer object cannot be called directly,
# so it is reloaded from disk as a BertTokenizerFast.
tokenizer_loaded = transformers.BertTokenizerFast.from_pretrained(model_path)


## __Set up model__

For scibert:
- https://analyticsindiamag.com/guide-to-scibert-a-pre-trained-bert-based-language-model-for-scientific-text/
- https://www.kaggle.com/code/gcspkmdr/scibert-wrapped-in-tf2/notebook


In [17]:
## inputs
# Token ids and attention masks, one row of max_length integers per text.
# The shape is written as an explicit 1-tuple: `(max_length)` is just the
# integer itself, not a tuple.
idx   = tf.keras.layers.Input(shape=(max_length,), dtype="int32",
                              name="input_idx")
masks = tf.keras.layers.Input(shape=(max_length,), dtype="int32",
                              name="input_masks")

In [18]:
# Run SciBERT on the symbolic inputs. Element [0] of the model output is the
# per-token hidden state (shape (None, 512, 768) in the output below).
bert_out = scibert(idx, attention_mask=masks)[0]
bert_out

<KerasTensor: shape=(None, 512, 768) dtype=float32 (created by layer 'tf_bert_model')>

In [19]:
## set up additional layers for fine-tuning
# Average the token vectors into one vector per text, pass it through a
# 64-unit ReLU layer, and end with a 2-unit softmax (one unit per class).
# NOTE(review): the pooling call receives no mask, and texts are padded to
# max_length when encoded, so padded positions appear to be included in the
# average. Confirm this is intended.
x     = tf.keras.layers.GlobalAveragePooling1D()(bert_out)
x     = tf.keras.layers.Dense(64, activation="relu")(x)
y_out = tf.keras.layers.Dense(2, activation='softmax')(x)

In [20]:
## build the model (compilation happens in the next cell)
# Assemble the functional model from the two inputs to the softmax output.
model = tf.keras.models.Model([idx, masks], y_out)
# Freeze the first three layers. Per the model summary these are the two
# input layers and tf_bert_model, so only the layers added after SciBERT
# remain trainable.
for layer in model.layers[:3]:
    layer.trainable = False

In [21]:
# Compile with Adam (learning rate 0.005) and sparse categorical
# cross-entropy, tracking accuracy; then print the model summary.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
loss      ='sparse_categorical_crossentropy'
metrics   =['accuracy']
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
              
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_idx (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 input_masks (InputLayer)       [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109918464   ['input_idx[0][0]',              
                                thPoolingAndCrossAt               'input_masks[0][0]']            
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

## __Text classification with transfer learning__

### _Set up train, valid, and test data_

In [22]:
# The pre-processed (cleaned) text is what gets encoded. Selecting a column
# from a Dataset returns a plain list, as the type check on the last line
# shows.
X_train = dataset_train['txt_clean']
X_valid = dataset_valid['txt_clean']
X_test  = dataset_test['txt_clean']
type(dataset_train), type(X_train)

(datasets.arrow_dataset.Dataset, list)

In [23]:
# Set up labels. These are taken from the original dataframes (as pandas
# Series) rather than from the Dataset objects, because the lists returned by
# a Dataset could not be used as labels in the model.fit call below.
y_train = train['label']
y_valid = valid['label']
y_test  = test['label']
type(train), type(y_train)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

### _Encode corpus_

In [24]:
# Define function to encode text data in batches
def batch_encode(tokenizer, texts, batch_size=256, max_length=512):
  """
  Encode texts in batches and return their token ids and attention masks,
  ready to be fed into a pre-trained transformer model.

  Every text is truncated or padded to exactly `max_length` tokens.

  Input:
  - tokenizer:   Tokenizer object providing `batch_encode_plus`
  - texts:       List of strings where each string represents a text
  - batch_size:  Integer controlling number of texts in a batch
  - max_length:  Integer controlling the number of tokens kept per text
    (default 512)
  Output:
  - input_ids:       token ids of all texts as a tf.Tensor object
  - attention_mask:  the texts' attention masks as a tf.Tensor object
  """
  input_ids = []
  attention_mask = []

  # Tokenize one slice of texts at a time and collect the results
  for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    inputs = tokenizer.batch_encode_plus(batch,
                                          max_length=max_length,
                                          padding='max_length',
                                          truncation=True,
                                          return_attention_mask=True,
                                          return_token_type_ids=False
                                          )
    input_ids.extend(inputs['input_ids'])
    attention_mask.extend(inputs['attention_mask'])

  return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [25]:
# Tokenize every split with the reloaded tokenizer; batch_encode returns a
# (token ids, attention mask) pair of tensors.
X_train_ids, X_train_attn = batch_encode(tokenizer_loaded, X_train)
X_valid_ids, X_valid_attn = batch_encode(tokenizer_loaded, X_valid)
X_test_ids , X_test_attn  = batch_encode(tokenizer_loaded, X_test)

# Model inputs: [ids, masks] as int32 numpy arrays, in the order of the
# model's two input layers.
X_train_bert = [np.asarray(encoded, dtype='int32')
                for encoded in (X_train_ids, X_train_attn)]
X_valid_bert = [np.asarray(encoded, dtype='int32')
                for encoded in (X_valid_ids, X_valid_attn)]
X_test_bert  = [np.asarray(encoded, dtype='int32')
                for encoded in (X_test_ids, X_test_attn)]

In [26]:
## The following is done in the original tutorial because the classes are not
## in numbers. So this can be skipped.
# encode y
#dic_y_mapping = {n:label for n,label in enumerate(np.unique(y_train))}
#inverse_dic   = {v:k for k,v in dic_y_mapping.items()}
#y_train_label = np.array([inverse_dic[y] for y in y_train])

In [27]:
# setup callbacks
# Early stopping watches val_loss with a patience of 5 epochs.
callback_es  = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
# The checkpoint saves the full model (not weights only) into model_path, and
# only when val_accuracy improves. Note that the two callbacks monitor
# different metrics.
callback_mcp = tf.keras.callbacks.ModelCheckpoint(filepath=model_path, 
                  mode='max', save_weights_only=False, monitor='val_accuracy', 
                  save_best_only=True)

In [29]:
## train
# Fit on the encoded training split and validate on the validation split
# after each epoch. batch_size and epochs are given literally here; the
# BATCH_SIZE and NR_EPOCHS constants defined earlier are not used by this call.
history = model.fit(x=X_train_bert, y=y_train, batch_size=128, 
                     epochs=20, shuffle=True, verbose=1,
                     validation_data=(X_valid_bert, y_valid),
                     callbacks=[callback_es, callback_mcp])

Epoch 1/20

2022-06-17 15:06:42.806207: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 2/20



INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 3/20



INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 4/20
Epoch 5/20



INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 6/20
Epoch 7/20



INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 8/20



INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 9/20
Epoch 10/20



INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 11/20



INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 12/20
Epoch 13/20



INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


INFO:tensorflow:Assets written to: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert/assets


Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


In [30]:
# Load model
# Rebuild the classifier from the same input/output tensors and restore the
# checkpointed weights into it.
# NOTE: model_loaded is built from the same idx/masks/y_out tensors as `model`,
# so the two share their layers; restoring weights here therefore also
# affects `model` if it exists.
model_loaded = tf.keras.models.Model([idx, masks], y_out)
# Freeze the layers of the model being loaded (not of `model`), so this cell
# does not depend on the training model having been built.
for layer in model_loaded.layers[:3]:
    layer.trainable = False

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
loss      ='sparse_categorical_crossentropy'
metrics   =['accuracy']
model_loaded.compile(loss=loss, optimizer=optimizer, metrics=metrics)

# model_path is the directory written by the ModelCheckpoint callback
model_loaded.load_weights(model_path)

2022-06-18 09:07:18.704842: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert: FAILED_PRECONDITION: /home/shius/projects/plant_sci_hist/2_text_classify/model_cln_bert_scibert; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f91c898d240>

#### _Get validation f1_

In [31]:
# Softmax outputs (one row per text, one column per class) for the validation
# split, from the restored model
y_valid_pred_prob = model_loaded.predict(X_valid_bert)

In [32]:
# Map each output column index to the label value at the same position in
# the sorted unique labels, then pick the most probable column per text.
dic_y_mapping = dict(enumerate(np.unique(y_valid)))
y_valid_pred  = [dic_y_mapping[col]
                 for col in np.argmax(y_valid_pred_prob, axis=1)]

In [33]:
# F1 on the validation split; f1_score is called with its default settings
valid_score = f1_score(y_valid, y_valid_pred)
print(valid_score)

0.8736775163323119


#### _Evaluate model with test set_

In [34]:
# Score the held-out test split with the restored model, the same model used
# for the validation F1, so both scores come from the checkpointed weights.
y_test_pred_prob = model_loaded.predict(X_test_bert)
y_test_pred = [dic_y_mapping[np.argmax(pred)] for pred in y_test_pred_prob]
test_score  = f1_score(y_test, y_test_pred)
print(test_score)

0.8715057825303656
