[Original Post](https://medium.com/swlh/fine-tuning-bert-for-text-classification-and-question-answering-using-tensorflow-framework-4d09daeb3330)

In [None]:
!pip install tf-models-official

In [None]:
import os
import numpy as np
import official.nlp.optimization
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
from official import nlp
from official.nlp.bert import tokenization

In [None]:
# list the GPU devices TensorFlow reports, then print how many there are
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
!nvidia-smi

In [None]:
## Defining functions to preprocess our data

def encode_sentence(sentence, tokenizer):
    """Tokenise one sentence and return its token ids, terminated by [SEP].

    Args:
        sentence: the text handed to ``tokenizer.tokenize``.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        Whatever ``tokenizer.convert_tokens_to_ids`` returns for the sentence
        tokens followed by a single ``'[SEP]'`` token.
    """
    # materialise the tokens and close the sentence with the separator token
    pieces = [*tokenizer.tokenize(sentence), '[SEP]']

    # map the token strings to their vocabulary ids
    return tokenizer.convert_tokens_to_ids(pieces)


def bert_encode(glue_dict, tokenizer, max_seq_len):
    """Encode sentence pairs into the three fixed-length inputs fed to BERT.

    Each example is laid out as ``[CLS] sentence1 [SEP] sentence2 [SEP]`` and
    then padded or truncated at the end to ``max_seq_len`` positions.

    Args:
        glue_dict: mapping with ``"sentence1"`` and ``"sentence2"`` entries,
            each a sequence of sentences (one pair per index).
        tokenizer: tokenizer handed to ``encode_sentence``; also used to look
            up the id of ``'[CLS]'``.
        max_seq_len: length every returned row is padded / truncated to.

    Returns:
        dict with the keys ``'input_word_ids'`` (token ids),
        ``'input_mask'`` (1 on every real token position) and
        ``'input_type_ids'`` (0 for [CLS] and sentence1 tokens, 1 for
        sentence2 tokens), each converted to a tensor.
    """

    def _encode_column(key):
        # one ragged row of token ids per sentence (rows differ in length)
        return tf.ragged.constant(
            [encode_sentence(text, tokenizer) for text in np.array(glue_dict[key])]
        )

    def _fit_to_length(dense_rows):
        # pad (or cut) at the END of every row so all rows have max_seq_len entries
        # NOTE(review): truncating='post' can drop the closing [SEP] of very long pairs
        return tf.keras.preprocessing.sequence.pad_sequences(
            dense_rows, maxlen=max_seq_len, padding='post', truncating='post'
        )

    first_ids = _encode_column("sentence1")
    second_ids = _encode_column("sentence2")

    # one [CLS] id per example, to be placed in front of every pair
    cls_column = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * first_ids.shape[0]
    packed_ids = tf.concat([cls_column, first_ids, second_ids], axis=-1)

    # mask: a 1 for every token present in the packed (still unpadded) rows
    mask = tf.ones_like(packed_ids).to_tensor()

    # segment ids: 0 over [CLS] + sentence1, 1 over sentence2
    segment_parts = [
        tf.zeros_like(cls_column),
        tf.zeros_like(first_ids),
        tf.ones_like(second_ids),
    ]
    segments = tf.concat(segment_parts, axis=-1).to_tensor()

    return {
        'input_word_ids': tf.convert_to_tensor(_fit_to_length(packed_ids.to_tensor())),
        'input_mask': tf.convert_to_tensor(_fit_to_length(mask)),
        'input_type_ids': tf.convert_to_tensor(_fit_to_length(segments)),
    }

In [None]:
## Building the classifier: BERT encoder -> dropout -> 2-unit dense head

# every model input is a sequence of exactly this many positions
max_seq_length = 128

# the three int32 Keras inputs, keyed (and named) identically
bert_inputs = {
    input_name: tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name=input_name
    )
    for input_name in ('input_word_ids', 'input_mask', 'input_type_ids')
}
input_word_ids = bert_inputs['input_word_ids']
input_mask = bert_inputs['input_mask']
input_type_ids = bert_inputs['input_type_ids']

# load the BERT layer from TF-Hub (trainable, so it is fine-tuned) and call it
# on the inputs; only the first of its two outputs is used below
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=True,
)
pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])

# classification head: dropout, then a Dense layer with 2 output units whose
# kernel is initialised from a truncated normal distribution
dropout_layer = tf.keras.layers.Dropout(rate=0.2)
output = dropout_layer(pooled_output)
initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)
output_layer = tf.keras.layers.Dense(
    2, kernel_initializer=initializer, name='output'
)
bert_output = output_layer(output)

# the model takes the dict of inputs and returns the head's output
model = tf.keras.Model(inputs=bert_inputs, outputs=bert_output)
model.summary()

In [None]:
# Build the tokenizer from the assets exposed by the loaded BERT layer
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()  # path of the vocabulary file
do_lower_case = bert_assets.do_lower_case.numpy()  # lower-casing flag stored with the model
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file, do_lower_case=do_lower_case
)


In [None]:
# Load GLUE/MRPC together with its metadata, then BERT-encode the train and
# validation splits with our tokenizer; labels are kept as separate arrays.
glue, info = tfds.load('glue/mrpc', with_info=True, batch_size=-1)

train_split = glue['train']
glue_train = bert_encode(
    glue_dict=train_split, tokenizer=tokenizer, max_seq_len=max_seq_length
)
glue_train_labels = train_split['label']

validation_split = glue['validation']
glue_validation = bert_encode(
    glue_dict=validation_split, tokenizer=tokenizer, max_seq_len=max_seq_length
)
glue_validation_labels = validation_split['label']

In [None]:
# Disabled walkthrough: the code below is wrapped in a triple-quoted string, so
# none of it executes. It replays the steps of `bert_encode` for one training
# pair (sample_idx) and prints each intermediate value. To run it, remove the
# surrounding triple quotes.
# NOTE(review): as the only expression in this cell, the string itself is
# presumably echoed as the cell's output in a notebook - confirm if that matters.
"""
## Sample breakdown of 'bert_encode' function; see step by step
sample_idx = 2

# load some sample sentences
sample_sentence_1 = glue['train']['sentence1'][sample_idx]
sample_sentence_2 = glue['train']['sentence2'][sample_idx]
sample_label = glue['train']['label'][sample_idx]

print('Sample Index:', glue['train']['idx'][sample_idx], '\n', 
      'Sample Label:', sample_label,'\n', 
      'Train Example:',sample_sentence_1, '\n',
      'Test Example:',sample_sentence_2, '\n'
      )

# split sample into individual tokens
sample_tokneized = list(tokenizer.tokenize(sample_sentence_1.numpy()))
print('Sample after tokenization', sample_tokneized[:5])

sample_tokneized.append('[SEP]')
print('Converting SEP to ID', tokenizer.convert_tokens_to_ids(['[SEP]']))
print('End of tokenized:', sample_tokneized[-5:])

# convert to ids
sample_tokenized_ids = tokenizer.convert_tokens_to_ids(sample_tokneized)
print('Tokenized and turn to ids', sample_tokenized_ids[:5])

# do for sent1 and 2; turn into ragged tensors of ids
sentence1 = tf.ragged.constant([encode_sentence(sample_sentence_1.numpy(), tokenizer)])
sentence2 = tf.ragged.constant([encode_sentence(sample_sentence_2.numpy(), tokenizer)])
print('\n', 'Ragged Tensor of sentence 2', sentence2)

# tokenize CLS then have as many as there are sentences (to split sentences)
cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
print('\n', 'CLS token ID', cls)

# create input tensor; can see we start with CLS token, then two SEP; one inbetween and one at end
input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)
print('\n', 'Ragged tensior of word 1 and 2', input_word_ids)

# Create input mask, cls, s1 and s2 tensors.
input_mask = tf.ones_like(input_word_ids).to_tensor()

# create input type id's, concat all of the following
type_cls = tf.zeros_like(cls) # shape (n, 1) after CLS
type_s1 = tf.zeros_like(sentence1) # shape of (n, sentences1); label as 0's (input types)
type_s2 = tf.ones_like(sentence2) # shape of (n, sentences2); label this as 1's (input types)
input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()
print('\n Input Type IDs: \n', input_type_ids) # shape of sentence 1

# pad all
input_word_ids = tf.keras.preprocessing.sequence.pad_sequences(input_word_ids.to_tensor(), maxlen=128, padding='post',truncating='post')
input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, maxlen=128, padding='post',truncating='post')
input_type_ids = tf.keras.preprocessing.sequence.pad_sequences(input_type_ids, maxlen=128, padding='post',truncating='post')

# create inputs dict
inputs = {
    'input_word_ids': input_word_ids,
    'input_mask': input_mask,
    'input_type_ids': input_type_ids
    }
print('\n BERT Inputs', inputs)
"""

In [None]:
## Compile and train the model

epochs = 10

# define sizes
batch_size = 32
eval_batch_size = 32
train_data_size = len(glue_train_labels)

# Number of optimizer updates per epoch. `fit` also trains on the final,
# partially filled batch, so this must be the CEILING of size / batch_size;
# flooring it makes num_train_steps smaller than the number of updates that
# actually happen, and the learning-rate schedule sized from it ends before
# training does.
steps_per_epoch = -(-train_data_size // batch_size)
num_train_steps = steps_per_epoch * epochs

# warm up over roughly the first 10% of the training steps
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# optimizer with warmup sized by the step counts above; accuracy metric and a
# sparse categorical cross-entropy loss (the model emits logits, and the
# labels are integer class ids)
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps
)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# train on the encoded training split, evaluating on the validation split
# after every epoch
model.fit(
    glue_train, glue_train_labels,
    validation_data=(glue_validation, glue_validation_labels),
    batch_size=batch_size,
    validation_batch_size=eval_batch_size,
    epochs=epochs,
)

In [None]:
# save the fine-tuned weights to disk
model.save_weights("./weights.h5")

# encode two hand-written sentence pairs with the same tokenizer and
# sequence length used for training
my_examples = bert_encode(
    glue_dict={
        'sentence1': [
            'The rain in Spain falls mainly on the plain.',
            'Look I fine tuned BERT.'],
        'sentence2': [
            'It mostly rains on the flat lands of Spain.',
            'Is it working? This does not match.']
    },
    tokenizer=tokenizer,
    max_seq_len=max_seq_length
    )

# raw model output: one row of class scores (logits) per example
result = model.predict(my_examples)
print(result)

# Pick the highest-scoring class for EACH example, i.e. reduce over the last
# (class) axis. tf.argmax defaults to axis=0, which would instead reduce over
# the batch axis and only looks right here because there are 2 examples and
# 2 classes.
result = tf.argmax(result, axis=-1).numpy()
print(result)

# translate the predicted class ids into the dataset's label names
print(np.array(info.features['label'].names)[result])

In [None]:
# Display the dataset metadata object that tfds.load(..., with_info=True) returned.
info