# __Step 2c: BERT models - distilbert__

Follow:
- [How to train a BERT model from scratch](https://towardsdatascience.com/how-to-train-a-bert-model-from-scratch-72cfce554fc6)
- [Hugging Face ByteLevelBPETokenizer tutorial](https://github.com/huggingface/blog/blob/main/how-to-train.md)

## __Setup__

In [1]:
# Install pytorch for working with SciBERT
# NOTE(review): the model cells in this notebook use TensorFlow/Keras
# (`import tensorflow as tf`, `TFDistilBertModel`) -- confirm this PyTorch
# install is still required.
%conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


### _Imports_

In [2]:
'''
For building text classification model based on embedding of Word2Vec and BERT
'''


## for data
#import argparse
import os
import json
import pandas as pd
import numpy as np
import pickle
import sys
import itertools
from pathlib import Path

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import transformers
from datasets import Dataset
from tokenizers import BertWordPieceTokenizer

import tensorflow as tf


  from .autonotebook import tqdm as notebook_tqdm


### _Functions_

In [3]:
def read_configs(config_file):
  """Read configuration file and return a config_dict

  The file holds one ``key=value`` entry per line. Blank lines and lines
  starting with ``#`` are ignored, and so are keys that are not in the
  required list below. Whitespace around the key and the value is ignored.

  Args:
    config_file (str or Path): path to the configuration file
  Return:
    config_dict (dict): required key -> evaluated value
  Raises:
    SystemExit: if any required key is absent from the file
  """
  # Unique marker for "not set yet". A plain 0 cannot be used for this because
  # 0 is a legitimate configuration value (e.g. rand_state=0).
  not_set = object()

  # required
  required = ('lang_model', 'proj_dir', 'work_dir', 'corpus_combo_file',
              'rand_state', 'bert_param')
  config_dict = dict.fromkeys(required, not_set)

  # Read config file and fill in the dictionary
  with open(config_file, 'r') as f:
    for line in f:
      line = line.strip()
      if line == "" or line.startswith("#"):
        continue

      # Split on the first "=" only so values may themselves contain "="
      key, sep, value = line.partition("=")
      key = key.strip()
      if sep and key in config_dict:
        # NOTE(review): eval() executes arbitrary Python taken from the config
        # file. Only use trusted config files; consider ast.literal_eval if
        # every value is a plain literal.
        config_dict[key] = eval(value.strip())

  # Check if any config missing
  missing = 0
  for config in config_dict:
    if config_dict[config] is not_set:
      print("  missing:", config)
      missing += 1
    else:
      print("  ", config, "=", config_dict[config])

  if missing == 0:
    print("  all config available")
  else:
    print("  missing config, QUIT!")
    # Non-zero status: quitting because of an incomplete config is a failure
    sys.exit(1)

  return config_dict


def split_train_validate_test(corpus_combo_file, rand_state):
  '''Load data and split train, validation, test subsets for the cleaned texts

  The corpus is split 80/20 into train+validation/test, and the 80% part is
  split again 75/25 into train/validation. Both splits are stratified on the
  `label` column.

  Args:
    corpus_combo_file (str or Path): path to the json data file
    rand_state (int): for reproducibility
  Return:
    [train, valid, test] (list of pandas dataframes): training, validation,
      testing sets with the columns `label`, `txt` and `txt_clean`
  '''
  # Local import so this function does not depend on the notebook import cell
  from io import StringIO

  # Load json file. Read-only mode: the file is never written here, and "r+"
  # would fail on a read-only corpus file.
  with Path(corpus_combo_file).open("r") as f:
      corpus_combo_json = json.load(f)

  # Convert json back to dataframe. When the loaded value is a JSON string,
  # wrap it in a file-like object: newer pandas versions deprecate passing
  # literal JSON text to read_json.
  if isinstance(corpus_combo_json, str):
    corpus_combo_json = StringIO(corpus_combo_json)
  corpus_combo = pd.read_json(corpus_combo_json)

  corpus = corpus_combo[['label','txt','txt_clean']]

  # Split train test
  train, test = train_test_split(corpus, 
      test_size=0.2, stratify=corpus['label'], random_state=rand_state)

  # Split train validate
  train, valid = train_test_split(train, 
      test_size=0.25, stratify=train['label'], random_state=rand_state)

  print(f"    train={train.shape}, valid={valid.shape}," +\
        f" test={test.shape}")

  return [train, valid, test]
  


### _Get training/testing split_

In [4]:
# Configuration file, given relative to the current working directory
config_file = Path("config_bert.txt")

print("\nRead configuration file...")
config_dict = read_configs(config_file)

# Seed reused by the data splits, for reproducibility
rand_state = config_dict['rand_state']

# Project directory -> working directory -> corpus file
proj_dir          = Path(config_dict['proj_dir'])
work_dir          = proj_dir.joinpath(config_dict['work_dir'])
corpus_combo_file = work_dir.joinpath(config_dict['corpus_combo_file'])

# NOTE: this changes the current directory, so on a re-run of this cell the
# relative config_file path above is resolved against work_dir.
os.chdir(work_dir)


Read configuration file...
   lang_model = bert
   proj_dir = /home/shius/projects/plant_sci_hist
   work_dir = 2_text_classify
   corpus_combo_file = corpus_combo
   rand_state = 20220609
   bert_param = {}
  all config available


In [5]:
# Produce the train/validation/test dataframes. Downstream encoding uses the
# cleaned text rather than the original, because of issues with
# non-alphanumeric characters and stop words in the original.
print("\nRead file and split train/validate/test...")
train, valid, test = split_train_validate_test(corpus_combo_file, rand_state)


Read file and split train/validate/test...
    train=(51987, 3), valid=(17329, 3), test=(17330, 3)


In [6]:
# Wrap each pandas split in a `datasets.Dataset`, in train/valid/test order
dataset_train, dataset_valid, dataset_test = (
    Dataset.from_pandas(split_df) for split_df in (train, valid, test)
)

In [7]:
# Display the training Dataset summary (feature names and number of rows)
dataset_train

Dataset({
    features: ['label', 'txt', 'txt_clean', '__index_level_0__'],
    num_rows: 51987
})

## __Tokenization__

### _Parameters_

In [8]:
# Tokenizer parameters

# Target vocabulary size passed to the tokenizer trainer
vocab_size = 52000

# Maximum sequence length in tokens. Lowering it results in faster training
# (and allows a larger batch size).
max_length = 512

# Minimum frequency passed to the tokenizer trainer
min_frequency = 2

# Whether to truncate samples longer than max_length
truncate_longer_samples = True

### _Write training texts to files_

In [9]:
# Write training texts to a folder where each file has up to `txts_per_file`
# entries, one entry per line-joined chunk.
txts_per_file = 5000

corpus_train_path = work_dir / "corpus_train"
corpus_train_path.mkdir(parents=True, exist_ok=True)

# Note that I use the original text for training tokenizer
txts  = train['txt'].values

# list of training corpus files
files = []             
for idx in range(0, len(txts), txts_per_file):
  subset = txts[idx:idx+txts_per_file]
  subset_file = corpus_train_path / f"txt_{idx}"

  # force posix path to be string, otherwise the training step below will fail
  files.append(str(subset_file))

  # Write as UTF-8 explicitly instead of relying on the platform default
  # encoding, so non-ASCII characters in the texts are written consistently.
  with open(subset_file, "w", encoding="utf-8") as f:
    subset_txts = '\n'.join(subset)
    f.write(subset_txts)

### _Train tokenizer from scratch_

In [10]:
# Initialize and train the WordPiece tokenizer on the corpus files
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]

tokenizer = BertWordPieceTokenizer()
tokenizer.train(
    files=files,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=special_tokens,
)

# Truncate encoded sequences to at most max_length tokens
tokenizer.enable_truncation(max_length=max_length)

# Output directory for the tokenizer files
model_path = work_dir / "model_cln_bert_test"
model_path.mkdir(parents=True, exist_ok=True)

# Save the trained tokenizer into the output directory
tokenizer.save_model(str(model_path))

# Tokenizer settings to dump next to the saved tokenizer: special tokens,
# whether to lower case and the maximum sequence length.
# NOTE(review): confirm that BertTokenizerFast.from_pretrained reads these
# settings from "config.json" (rather than e.g. "tokenizer_config.json").
tokenizer_cfg = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "max_len": max_length,
}
with open(model_path / "config.json", "w") as f:
    json.dump(tokenizer_cfg, f)

# This step is critical: the trained tokenizer object cannot be called
# directly, so it is reloaded from disk as a BertTokenizerFast.
tokenizer_loaded = transformers.BertTokenizerFast.from_pretrained(model_path)







## __Set up model__

For scibert:
- https://analyticsindiamag.com/guide-to-scibert-a-pre-trained-bert-based-language-model-for-scientific-text/
- https://www.kaggle.com/code/gcspkmdr/scibert-wrapped-in-tf2/notebook


In [11]:
## inputs
# Two int32 inputs of length max_length per sample: token ids and attention
# mask. The shape is written as a one-element tuple; `(max_length)` without
# the trailing comma is just an int, not a shape tuple.
idx   = tf.keras.layers.Input((max_length,), dtype="int32", name="input_idx")
masks = tf.keras.layers.Input((max_length,), dtype="int32", name="input_masks")

### _distilbert_

In [13]:
## Load pre-trained DistilBERT with a custom config
pretrained_name = 'distilbert-base-uncased'

# Dropout of 0.2 for both the general and the attention dropout; hidden
# states of the intermediate layers are not returned.
config = transformers.DistilBertConfig(
    dropout=0.2,
    attention_dropout=0.2,
    output_hidden_states=False,
)

pretrained = transformers.TFDistilBertModel.from_pretrained(
    pretrained_name, config=config
)

2022-06-17 13:37:25.642924: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-06-17 13:37:27.293746: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_projector', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassificati

In [15]:
# Run the pre-trained model on the two inputs and keep the first element of
# its output (shown as last_hidden_state in the model summary).
bert_out = pretrained(idx, attention_mask=masks)[0]

In [16]:

## Classification head on top of the DistilBERT output
# Layers are created in the order they are applied: average over the sequence
# axis, a 64-unit ReLU layer, then a 2-unit softmax output.
# NOTE(review): no mask is passed to the pooling layer, so padded positions
# presumably contribute to the average -- confirm this is intended.
pooling_layer = tf.keras.layers.GlobalAveragePooling1D()
hidden_layer  = tf.keras.layers.Dense(64, activation="relu")
output_layer  = tf.keras.layers.Dense(2, activation='softmax')

x     = hidden_layer(pooling_layer(bert_out))
y_out = output_layer(x)

In [17]:

## Build and compile the model
model = tf.keras.models.Model(inputs=[idx, masks], outputs=y_out)

# Freeze the first three layers of the model (per the summary: the two input
# layers and the DistilBERT model), so only the added layers are trained.
for frozen_layer in model.layers[:3]:
    frozen_layer.trainable = False

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
loss      = 'sparse_categorical_crossentropy'
metrics   = ['accuracy']
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_idx (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 input_masks (InputLayer)       [(None, 512)]        0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_idx[0][0]',              
 BertModel)                     ast_hidden_state=(N               'input_masks[0][0]']            
                                one, 512, 768),                                                   
                                 hidden_states=None                                           

## __Text classification with transfer learning__

### _Set up train, valid, and test data_

In [18]:
# Use the pre-processed (cleaned) text column as model input. Indexing a
# Dataset by column name returns a plain list, as the displayed types show.
X_train, X_valid, X_test = (
    split_ds['txt_clean']
    for split_ds in (dataset_train, dataset_valid, dataset_test)
)
type(dataset_train), type(X_train)

(datasets.arrow_dataset.Dataset, list)

In [19]:
# Labels are taken from the original dataframes (as pandas Series) rather
# than from the Dataset objects, because a list cannot be used to store
# labels for the model.fit call below.
y_train, y_valid, y_test = (
    split_df['label'] for split_df in (train, valid, test)
)
type(train), type(y_train)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

### _Encode corpus_

In [20]:
# Define function to encode text data in batches
def batch_encode(tokenizer, texts, batch_size=256, max_length=512):
  """
  Encode texts in batches and return their input ids and attention masks,
  ready to be fed into a pre-trained transformer model.

  Every text is padded or truncated to exactly `max_length` tokens.

  Input:
  - tokenizer:   Tokenizer object providing `batch_encode_plus`
  - texts:       List of strings where each string represents a text
  - batch_size:  Integer controlling number of texts in a batch
  - max_length:  Integer controlling max number of tokens kept for a given
    text (default 512)
  Output:
  - input_ids:       token ids of all texts as a tf.Tensor object
  - attention_mask:  the texts' attention masks as a tf.Tensor object
  """
  input_ids = []
  attention_mask = []

  for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    inputs = tokenizer.batch_encode_plus(batch,
                                          max_length=max_length,
                                          padding='max_length',
                                          truncation=True,
                                          return_attention_mask=True,
                                          return_token_type_ids=False
                                          )
    # Accumulate the per-batch results into flat per-text lists
    input_ids.extend(inputs['input_ids'])
    attention_mask.extend(inputs['attention_mask'])

  return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [21]:
# Encode the three text splits with the tokenizer trained in this notebook.
# NOTE(review): tokenizer_loaded has its own vocabulary (vocab_size = 52_000),
# while the model is the pre-trained 'distilbert-base-uncased'. These token ids
# presumably do not correspond to the pre-trained embedding rows and may exceed
# the pre-trained vocabulary size -- confirm, and consider using the tokenizer
# that ships with the pre-trained model.
X_train_ids, X_train_attn = batch_encode(tokenizer_loaded, X_train)
X_valid_ids, X_valid_attn = batch_encode(tokenizer_loaded, X_valid)
X_test_ids , X_test_attn  = batch_encode(tokenizer_loaded, X_test)

# Model inputs per split: [token ids, attention masks] as int32 arrays
X_train_bert = [np.asarray(encoded, dtype='int32')
                for encoded in (X_train_ids, X_train_attn)]
X_valid_bert = [np.asarray(encoded, dtype='int32')
                for encoded in (X_valid_ids, X_valid_attn)]
X_test_bert  = [np.asarray(encoded, dtype='int32')
                for encoded in (X_test_ids, X_test_attn)]

In [22]:
## The following is done in the original tutorial because the classes are not
## in numbers. So this can be skipped.
# encode y
#dic_y_mapping = {n:label for n,label in enumerate(np.unique(y_train))}
#inverse_dic   = {v:k for k,v in dic_y_mapping.items()}
#y_train_label = np.array([inverse_dic[y] for y in y_train])

In [23]:
# Checkpoint output location.
# NOTE: this is the same directory the tokenizer files were saved to
# (work_dir / "model_cln_bert_test").
cp_filepath = work_dir / "model_cln_bert_test"

# Stop training once val_loss has not improved for 5 epochs
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)

# Keep only the full model with the highest val_accuracy
callback_mcp = tf.keras.callbacks.ModelCheckpoint(
    filepath=cp_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
)

In [None]:
## Train: up to 20 epochs with shuffling, validating on the validation split
## each epoch; early stopping and checkpointing are handled by the callbacks.
history = model.fit(
    x=X_train_bert,
    y=y_train,
    validation_data=(X_valid_bert, y_valid),
    epochs=20,
    batch_size=512,
    shuffle=True,
    callbacks=[callback_es, callback_mcp],
    verbose=1,
)

In [None]:
# Rebuild the model on the same input/output tensors and restore the weights
# saved by the checkpoint callback.
# NOTE(review): model_loaded is built from the same idx/masks/y_out tensors as
# `model`, so the two presumably share layer objects and loading weights here
# also affects `model` -- confirm.
model_loaded = tf.keras.models.Model([idx, masks], y_out)

# Freeze the first three layers of *this* model (the original looped over
# `model.layers`, a copy-paste slip).
for layer in model_loaded.layers[:3]:
    layer.trainable = False

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
loss      ='sparse_categorical_crossentropy'
metrics   =['accuracy']
model_loaded.compile(loss=loss, optimizer=optimizer, metrics=metrics)

model_loaded.load_weights(cp_filepath)

#### _Get validation f1_

In [None]:
# Predicted class probabilities for the validation split, from the model with
# the restored checkpoint weights
y_valid_pred_prob = model_loaded.predict(X_valid_bert)

In [None]:
# Map each output position to its label: position n corresponds to the n-th
# sorted unique label found in y_valid.
dic_y_mapping = dict(enumerate(np.unique(y_valid)))

# Predicted label = label of the highest-probability position
y_valid_pred = [
    dic_y_mapping[np.argmax(prob_row)] for prob_row in y_valid_pred_prob
]

In [None]:
# F1 score of the predicted labels against the true validation labels
valid_score = f1_score(y_valid, y_valid_pred)
print(valid_score)

#### _Evaluate model with test set_

In [None]:
# Evaluate on the test split with the same checkpoint-restored model that is
# used for the validation score (the original called `model.predict` here,
# which is inconsistent with the validation cells).
y_test_pred_prob = model_loaded.predict(X_test_bert)

# Predicted label = label of the highest-probability position
y_test_pred = [dic_y_mapping[np.argmax(pred)] for pred in y_test_pred_prob]

# F1 score of the predicted labels against the true test labels
test_score  = f1_score(y_test, y_test_pred)
print(test_score)
print(test_score)