
This lab is adapted from the TensorFlow tutorial for text classification with BERT:

https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb#scrollTo=EqL7ihkN_862


##### Copyright 2020 The TensorFlow Hub Authors.


In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Classify text with BERT

Steps

- Load a BERT model from TensorFlow Hub
- Install data
- Build a model by combining BERT with a classifier
- Fine-tune BERT to create a model based on the TREC data
- Save the model and use it to classify texts
- Check for weak classes


## About BERT
The BERT family of models uses the Transformer encoder architecture to process each token of input text in the full context of all tokens before and after. This is the reason for the name: Bidirectional Encoder Representations from Transformers. 

BERT models are usually pre-trained on a large corpus of text, then fine-tuned for specific tasks. The example we will show here is one of the standard BERT models, fine-tuned on the TREC question-classification corpus.


## Setup


In [None]:
# A dependency of the preprocessing for BERT inputs
# Uncomment if you need to install this
#!pip install -q -U "tensorflow-text==2.8.*"

We will use the AdamW optimizer, which is currently the most commonly used BERT optimizer, from [tensorflow/models](https://github.com/tensorflow/models).

In [None]:
#!pip install -q tf-models-official==2.7.0

In [None]:
#!pip install numpy==1.21

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

# Only show TensorFlow log messages of severity ERROR or higher, to keep cell output readable.
tf.get_logger().setLevel('ERROR')

***
### Install the trec dataset
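We will use the TREC dataset. The code below expects it to be available locally under `trec_processed/`, with a `training` and a `test` subdirectory that each contain one folder of `.txt` files per class.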

***

In [None]:
# find out the total number of text files in the dataset and what the classes are
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
# Load the training directory once, only to discover its class sub-folders.
training_ds = tf.keras.utils.text_dataset_from_directory(
    directory='trec_processed/training',
)

class_names = training_ds.class_names
print(class_names)


In [None]:
# Shared data-pipeline settings
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 200
seed = 42

# The training and validation subsets are requested with the same split
# fraction and seed, so they are cut from the same shuffled file list.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

# 80% of the training directory: used for fitting
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'trec_processed/training', subset='training', **split_options)
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Remaining 20% of the training directory: used for validation
val_ds = tf.keras.utils.text_dataset_from_directory(
    'trec_processed/training', subset='validation', **split_options)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Held-out test directory
test_ds = tf.keras.utils.text_dataset_from_directory(
    'trec_processed/test', batch_size=batch_size)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Count the text files in each class folder of the training directory
files_dict = {}
for class_name in class_names:
    # list_files is a static constructor, so it is called on the Dataset class
    # rather than through the unrelated train_ds instance. Order does not
    # matter for counting, so shuffling is switched off.
    class_files = tf.data.Dataset.list_files(
        'trec_processed/training/' + class_name + '/*.txt', shuffle=False)
    files_dict[class_name] = class_files.cardinality().numpy()

# Sort the categories, largest first
from collections import OrderedDict
sorted_files_dict = sorted(files_dict.items(),
                           key=lambda item: item[1], reverse=True)
print(sorted_files_dict)

# Convert to Pandas series
pd_files_dict = pd.Series(dict(sorted_files_dict))

# Setting figure, ax into variables
fig, ax = plt.subplots(figsize=(20,10))

# Plot the class sizes; labelled axes let the figure stand on its own
all_plot = sns.barplot(x=pd_files_dict.index,
                       y=pd_files_dict.values, ax=ax, palette="Set2")
ax.set_xlabel("Class")
ax.set_ylabel("Number of training files")
ax.set_title("Training files per class")
plt.xticks(rotation = 90)
plt.show()

In [None]:
import numpy as np


def _collect(dataset, position):
    """Concatenate element `position` of every (text, label) batch into one array."""
    return np.concatenate([batch[position] for batch in dataset], axis=0)


# Position 0 of a batch holds the texts, position 1 the labels
y_train = _collect(train_ds, 1)
x_train = _collect(train_ds, 0)

y_val = _collect(val_ds, 1)
x_val = _collect(val_ds, 0)

y_test = _collect(test_ds, 1)
x_test = _collect(test_ds, 0)

validation_data = (x_val, y_val)


***
## Check a few examples of data and labels
***

In [None]:
# Print the first three texts of one test batch together with their labels
for text_batch, label_batch in test_ds.take(1):
  batch_texts = text_batch.numpy()
  batch_labels = label_batch.numpy()
  for i in range(3):
    label = batch_labels[i]
    print(f'Review: {batch_texts[i]}')
    print(f'Label : {label} ({class_names[label]})')

***
## Loading models from TensorFlow Hub

We will use one of the smaller BERT models in order for it to run in a reasonable amount of time on a desktop computer. This is slightly bigger than the model we used in Chapter 11.
* "small_bert/bert_en_uncased_L-4_H-512_A-8/1"
* There are 4 hidden layers (that is, Transformer blocks), with a hidden size of 512
* A=8 Attention heads
This model was trained on Wikipedia and BooksCorpus.
There are many larger and smaller models that can be downloaded from the TensorFlow hub.
***

In [None]:

# Name of the BERT variant used throughout the notebook
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'

# TF Hub handle of the encoder for each supported model name
map_name_to_handle = dict([
    ('small_bert/bert_en_uncased_L-4_H-512_A-8',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'),
])

# TF Hub handle of the matching preprocessing model for each model name
map_model_to_preprocess = dict([
    ('small_bert/bert_en_uncased_L-4_H-512_A-8',
     'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'),
])

tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]
tfhub_handle_encoder = map_name_to_handle[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

## Preprocessing

Text inputs are transformed to numeric token ids and arranged in several Tensors before being input to BERT. 

TensorFlow Hub provides a matching preprocessing model for each of the BERT models discussed above

We will load the preprocessing model into a [hub.KerasLayer](https://www.tensorflow.org/hub/api_docs/python/hub/KerasLayer) to compose the fine-tuned model. 

In [None]:
# Wrap the TF Hub preprocessing model selected above as a Keras layer.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

Let's try the preprocessing model on some text and see the output:


In [None]:
# A single sentence is enough to inspect what the preprocessor produces
test_text = ['sure is a great movie. i like it']
print(test_text)

text_preprocessed = bert_preprocess_model(test_text)

# Pull out the three tensors once, then show the first 12 positions of each
word_ids = text_preprocessed["input_word_ids"]
input_mask = text_preprocessed["input_mask"]
type_ids = text_preprocessed["input_type_ids"]

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {word_ids.shape}')
print(f'Word Ids   : {word_ids[0, :12]}')
print(f'Input Mask : {input_mask[0, :12]}')
print(f'Type Ids   : {type_ids[0, :12]}')

As you can see, now you have the 3 outputs from the preprocessing that a BERT model would use (`input_word_ids`, `input_mask` and `input_type_ids`).

Since this text preprocessor is a TensorFlow model, it can be included in your model directly.

## Using the BERT model

Before putting BERT into our model, let's take a look at its outputs. Load it from TF Hub and see the returned values.

In [None]:
# Wrap the TF Hub BERT encoder selected above as a Keras layer.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# Run the encoder on the preprocessed example sentence
bert_results = bert_model(text_preprocessed)

pooled_output = bert_results["pooled_output"]
sequence_output = bert_results["sequence_output"]

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{pooled_output.shape}')
print(f'Pooled Outputs Values:{pooled_output[0, :12]}')
print(f'Sequence Outputs Shape:{sequence_output.shape}')
print(f'Sequence Outputs Values:{sequence_output[0, :12]}')

The BERT models return a map with 3 important keys: `pooled_output`, `sequence_output`, `encoder_outputs`:

- `pooled_output` represents each input sequence as a whole. The shape is `[batch_size, H]`. You can think of this as an embedding for the entire text.
- `sequence_output` represents each input token in the context. The shape is `[batch_size, seq_length, H]`. You can think of this as a contextual embedding for every token in the text.
- `encoder_outputs` are the intermediate activations of the `L` Transformer blocks. `outputs["encoder_outputs"][i]` is a Tensor of shape `[batch_size, seq_length, H]` (here `H` = 512) with the outputs of the i-th Transformer block, for `0 <= i < L`. The last value of the list is equal to `sequence_output`.

For the fine-tuning we are going to use the `pooled_output` array.

## Define the model

Here we create a very simple fine-tuned model, with the preprocessing model, the selected BERT model, one Dense and a Dropout layer. The parameter to the Dropout layer can be increased to make the model more robust. We are now working with a categorical classification problem (six classes), rather than a binary classification problem (two classes). Two changes that are needed in the model definition for the categorical task are in the final layer, which has six outputs, corresponding to the six classes, and a softmax activation function, as opposed to the sigmoid activation function that we used for binary problems. 


In [None]:
def build_classifier_model(num_classes=6, dropout_rate=0.1):
    """Build the fine-tuning model: raw text -> preprocessing -> BERT -> softmax.

    Args:
        num_classes: Number of units in the final softmax layer, one per
            class. Defaults to 6, the value previously hard-coded here.
        dropout_rate: Rate of the Dropout layer applied to BERT's pooled
            output. Defaults to 0.1, the value previously hard-coded here.

    Returns:
        An uncompiled `tf.keras.Model` that takes a batch of strings and
        returns one row of class probabilities per string.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    # trainable=True so the encoder weights are updated during fine-tuning
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    # The pooled output summarises the whole input text in one vector
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    net = tf.keras.layers.Dense(num_classes, activation = tf.keras.activations.softmax, name='classifier')(net)
    return tf.keras.Model(text_input, net)

Let's check that the model runs with the output of the preprocessing model.

In [None]:
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(test_text))
# The classifier's final layer already applies softmax, so the output is
# printed as-is; applying softmax a second time would distort the probabilities.
print(bert_raw_result)

The output is meaningless, because the model has not been trained yet but it verifies that the model runs with the preprocessing.

Let's take a look at the model's structure.

In [None]:
# Draw a diagram of the model's layers.
# NOTE(review): plot_model relies on the pydot and graphviz packages — confirm they are installed.
tf.keras.utils.plot_model(classifier_model)

***
## Model training
We now have all the pieces to train a model, including the preprocessing module, BERT encoder, data, and classifier.
***

### Loss function and metrics

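Since this is a categorical classification problem (that is, there are more than two outcomes), we'll use a categorical cross-entropy loss function. Because the labels are integer class indices rather than one-hot vectors, the sparse variant, "sparse_categorical_crossentropy", is used.
Similarly, the metric should be a categorical accuracy metric that matches this label encoding.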
Cross-entropy estimates the loss by scoring the average difference between the actual and predicted probability distributions for all classes.


In [None]:
# The labels are integer class indices (they are used to index class_names),
# not one-hot vectors, so the sparse variants of loss and metric are required.
loss = "sparse_categorical_crossentropy"

# CategoricalAccuracy expects one-hot labels and reports a meaningless score on
# integer labels. The name is pinned so the training history keeps the
# 'categorical_accuracy' / 'val_categorical_accuracy' keys.
metrics = tf.metrics.SparseCategoricalAccuracy(name='categorical_accuracy')

### Optimizer

For fine-tuning, we'll use the same optimizer that BERT was originally trained with: "Adaptive Moments" (Adam), here in its weight-decay variant, AdamW. Adam is popular because it is fast and efficient.

In line with the BERT paper, the initial learning rate is smaller for fine-tuning (best of 5e-5, 3e-5, 2e-5). BERT generally does best with very small learning rates for fine-tuning.

In [None]:
epochs = 8

# Number of batches in the training dataset, i.e. optimizer steps per epoch
steps_per_epoch = train_ds.cardinality().numpy()
print(steps_per_epoch)

num_train_steps = epochs * steps_per_epoch
# a linear warmup phase over the first 10%
num_warmup_steps = int(num_train_steps * 0.1)

init_lr = 3e-5

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

### Loading the BERT model and training

Using the `classifier_model` you created earlier, you can compile the model with the loss, metric and optimizer, and take a look at the summary.
It's a good idea to check the model before starting a lengthy training process to make sure the model is as expected.

In [None]:
# Attach the optimizer, loss and metric defined in the cells above to the model.
classifier_model.compile(optimizer=optimizer,
                         loss = loss,
                         metrics=metrics)
# Print the layer-by-layer summary to check the architecture before training.
classifier_model.summary()

### Training
Training time will vary depending on the complexity of the selected BERT model.
For this model, dataset, and number of epochs, the training should take a few hours on a cpu.
Setting "verbose" to 2 prints one line of feedback per epoch, which is useful for seeing whether things are going as expected, so that the training can be stopped if they are not.

In [None]:
print(f'Training model with {tfhub_handle_encoder}')

# Fine-tune on the training split and track the validation split each epoch
history = classifier_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    verbose=2,
)

### Evaluate the model

Let's see how the model performs on the test data. Two values will be returned -- loss and accuracy.

In [None]:
# Dedicated names keep the `loss` setting used by compile() from being
# overwritten, so the compile cell can still be re-run after evaluation.
test_loss, test_accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {test_loss}')
print(f'Accuracy: {test_accuracy}')

### Plot the accuracy and loss over time

With the `History` object returned by `model.fit()`, you can plot the training and validation loss for comparison, as well as the training and validation accuracy:

In [None]:
import matplotlib.pyplot as plt

history_dict = history.history
print(history_dict.keys())

acc = history_dict['categorical_accuracy']
val_acc = history_dict['val_categorical_accuracy']
# Separate names keep the `loss` and `epochs` training settings intact, so the
# compile and fit cells can be re-run after plotting.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']
epoch_range = range(1, len(acc) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# Training curves are dashed red, validation curves are solid blue
loss_ax.plot(epoch_range, train_loss, 'r', linestyle="dashed", label='Training loss')
loss_ax.plot(epoch_range, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epoch_range, acc, 'r', linestyle="dashed", label='Training acc')
acc_ax.plot(epoch_range, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# Adjust spacing only after both panels exist, otherwise it has nothing to lay out
fig.tight_layout()
plt.show()

In [None]:
# Model output for every example in the test dataset.
predictions = classifier_model.predict(test_ds)

In [None]:
for text_batch, label_batch in test_ds.take(1):
  # Run the model once for the whole batch instead of once per printed example
  batch_predictions = classifier_model.predict(text_batch.numpy())
  for i in range(3):
    print(f'Review: {text_batch.numpy()[i]}')
    label = label_batch.numpy()[i]
    print(f'Label : {label} ({class_names[label]})')
    # Show only this example's class probabilities, not the whole batch
    prediction = batch_predictions[i]
    print(prediction)

    #tf.math.confusion_matrix(labels=labels, predictions=predictions).numpy()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Group the winning probability of every test prediction by its predicted class
scores = [[] for _ in class_names]
for prediction in predictions:
    classification = np.max(prediction)
    max_index = np.argmax(prediction)
    scores[max_index].append(classification)

# One histogram per class, each on its own axes of a single figure
fig, axes = plt.subplots(len(class_names), 1, squeeze=False,
                         figsize=(10, 3 * len(class_names)))
plots = []
for class_name, class_scores, ax in zip(class_names, scores, axes[:, 0]):
    print(class_name, len(class_scores))
    # A class that was never predicted has nothing to draw; leave its axes empty
    if class_scores:
        sns.histplot(class_scores, bins=50, ax=ax)
    ax.set_title(class_name)
    ax.set_xlabel("Probability Score")
    plots.append(ax)

fig.tight_layout()
plt.show()
    


In [None]:
#checking for weak classes
import matplotlib.pyplot as plt
import seaborn as sns

# One list of winning probabilities per predicted class
scores = [[] for _ in class_names]

# Upper bound on the number of training batches to score
max_batches = 100

for text_batch, label_batch in train_ds.take(max_batches):
    # Score the whole batch in one call. This replaces one predict() call per
    # example and a fixed count of 160 examples per batch, which skipped the
    # rest of each batch and would fail on a batch shorter than 160.
    batch_predictions = classifier_model.predict(text_batch.numpy(), verbose=0)
    for prediction in batch_predictions:
        classification = np.max(prediction)
        max_index = np.argmax(prediction)
        scores[max_index].append(classification)


In [None]:
# Report how many predictions fell into each class ...
for class_scores in scores:
    print(len(class_scores))

# ... and the mean winning probability of each class
averages = [np.average(class_scores) for class_scores in scores]

print(averages)

def make_histogram(score_data,class_name):
    """Show a 100-bin histogram of `score_data`, titled with `class_name`."""
    axes = sns.histplot(score_data, bins=100)
    axes.set_xlabel("Probability Score")
    axes.set_title(class_name)
    plt.show()

# Draw one histogram per class, pairing each score list with its class name
for class_index, class_scores in enumerate(scores):
    make_histogram(class_scores, class_names[class_index])


In [None]:
y_pred = classifier_model.predict(x_test)
print(y_pred)
print(y_test)

# Pick the most probable class directly from the probabilities. Thresholding
# at 0.5 first would turn every low-confidence row into all zeros, and argmax
# would then silently report class 0 for it.
predicted_classes = np.argmax(y_pred, axis=1).tolist()
# View the results as a confusion matrix
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,f1_score,classification_report
conf_matrix = confusion_matrix(y_test,predicted_classes,normalize=None)


In [None]:
# Show the class names; a label index i corresponds to class_names[i].
print(class_names)

In [None]:
# View the results as a confusion matrix
from sklearn.metrics import confusion_matrix

# Raw (un-normalised) counts of true test labels against predicted classes
conf_matrix = confusion_matrix(
    y_true=y_test,
    y_pred=predicted_classes,
    normalize=None,
)
print(conf_matrix)


In [None]:
# Named `sample_questions` rather than `text` so that the `tensorflow_text`
# module imported as `text` is not overwritten.
sample_questions = ["who is gerald ford", "where is the washington monument"]

one_prediction = classifier_model.predict(sample_questions)
print(one_prediction[0:])
# Show the class probabilities of the second question, rounded to 4 decimals
single = one_prediction[1]
print(single)
for prob in single:
    result = round(prob,4)
    print(result)
    
    


In [None]:
y_pred = classifier_model.predict(x_test)
print(y_pred)
print(y_test)

# Pick the most probable class directly from the probabilities. Thresholding
# at 0.5 first would turn every low-confidence row into all zeros, and argmax
# would then silently report class 0 for it.
predicted_classes = np.argmax(y_pred, axis=1).tolist()
print(predicted_classes)

In [None]:
# View the results as a confusion matrix
from sklearn.metrics import confusion_matrix

# Raw (un-normalised) counts of true test labels against predicted classes
conf_matrix = confusion_matrix(
    y_true=y_test,
    y_pred=predicted_classes,
    normalize=None,
)
print(conf_matrix)


In [None]:
# Displaying the confusion matrix
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,f1_score,classification_report
import matplotlib.pyplot as plt

# Larger font so the cell counts and tick labels stay readable
plt.rcParams['font.size'] = 12

print(class_names)

# Label both axes of the matrix with the class names
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues, xticks_rotation=75)
plt.show()

# Per-class precision, recall and F1 on the test set
report = classification_report(y_test, predicted_classes, target_names=class_names)
print(report)