### Quantized model benchmarking
Computation of the accuracies of the quantized models

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local .py files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys 
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.mobilenet import preprocess_input
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# configuration parameters
# Every tunable value of the notebook lives in this cell. Each name is assigned
# exactly once, so the effective value is the one a reader sees.

# directory handed to the test image generator (relative to the notebook)
TEST_DATA_DIR = './test-hippo/'
MODEL_PATH = "./models/mobilenet-hippo.h5"
# directory that holds the exported .tflite files
TFLITE_MODEL_DIR = "./models/tflite-hippo/"
# choose the model type here
QUANT_TYPE = "both_int8" # no_quant, w_int8, w_fp16, both_int8, both_fp16
# number of test images pushed through the interpreter
TEST_SAMPLES = 300
IMG_WIDTH, IMG_HEIGHT = 224, 224
BATCH_SIZE = 64
# class names passed to the classification report as target names
LABELS = ["hippo", "other"]
# derived from LABELS so the class count can never drift from the label list
NUM_CLASSES = len(LABELS)
# human-readable description of each quantization type, used in printed messages
QUANT_NAME_MAP = {"no_quant": "no quantization", "w_int8": "weights 8-bit INT quantized", 
                  "w_fp16": "weights 16-bit FP quantized", "both_int8": "both weights and activations INT8 quantized", 
                 "both_fp16": "both weights and activations FP-16 quantized"}

### Load quantized model according to the type

In [None]:
# Load the TFLite interpreter that matches QUANT_TYPE.
#
# Each supported quantization type maps to one .tflite file inside
# TFLITE_MODEL_DIR; a single lookup replaces the five copy-pasted branches.
TFLITE_MODEL_FILES = {
    "no_quant": "mobilenet_no_quant.tflite",              # model without any quantization
    "w_int8": "mobilenet_weights_int8_quant.tflite",      # weights INT8 quantization
    "w_fp16": "mobilenet_weights_float16_quant.tflite",   # weights FP16 quantization
    "both_int8": "mobilenet_both_int8_quant.tflite",      # weights and activations INT8 quantization
    "both_fp16": "mobilenet_both_fp16_quant.tflite",      # weights and activations FP16 quantization
}

if QUANT_TYPE not in TFLITE_MODEL_FILES:
    # Report the offending value itself: looking it up in QUANT_NAME_MAP would
    # raise a KeyError before any message could be shown, and exiting with
    # status 0 would signal success.
    raise ValueError(
        f"Wrong quantization type has been chosen: {QUANT_TYPE!r}; "
        f"expected one of {sorted(TFLITE_MODEL_FILES)}"
    )

print(f"interpreter for {QUANT_NAME_MAP[QUANT_TYPE]} loading ...")
interpret = tf.lite.Interpreter(model_path=TFLITE_MODEL_DIR + TFLITE_MODEL_FILES[QUANT_TYPE])
interpret.allocate_tensors() # allocate memory to the model

In [5]:
# Look up the tensor indices of the interpreter's first input and first
# output; they are used later to feed an image in and read the prediction out.
first_input_details = interpret.get_input_details()[0]
first_output_details = interpret.get_output_details()[0]
input_ind, out_ind = first_input_details["index"], first_output_details["index"]

### Create test generator

In [None]:
# Generator that applies the MobileNet preprocessing function to every image.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# One image per batch, in a fixed (unshuffled) order, with categorical labels.
flow_options = dict(
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=1,
    shuffle=False,
    class_mode='categorical',
)
test_generator = test_datagen.flow_from_directory(TEST_DATA_DIR, **flow_options)

References for using interpreter: 
- TensorFlow guide (see the "Evaluate the models" section): https://www.tensorflow.org/lite/performance/post_training_float16_quant 
- Example that uses a batch generator and batch predictions (TODO: adopt later): https://thinkmobile.dev/testing-tensorflow-lite-image-classification-model/ 
- TinyML book chapter on Interpreter

Note: TFLite inference on a desktop is slower because TFLite is optimized for mobile devices (ARM processors) rather than for Intel processors. For more details, see: 
- https://stackoverflow.com/questions/54093424/why-is-tensorflow-lite-slower-than-tensorflow-on-desktop 


In [None]:
%%time
# Run the interpreter on TEST_SAMPLES images, one at a time, and collect the
# predicted class (highest score) and the true class of each image.
#
# Results of this cell:
#   pred           - list of predicted class indices
#   true           - list of true class indices
#   accuracy_count - number of images where the two agree
print(f"computing results for {QUANT_NAME_MAP[QUANT_TYPE]} ... \n")

# Refuse to evaluate more images than exist: the generator would otherwise
# wrap around and silently count some images twice.
if TEST_SAMPLES > test_generator.samples:
    raise ValueError(
        f"TEST_SAMPLES ({TEST_SAMPLES}) exceeds the number of test images "
        f"found ({test_generator.samples})"
    )

# Start from the first image so that re-running this cell evaluates the same
# images again instead of continuing where the previous run stopped.
test_generator.reset()

pred = []
true = []
accuracy_count = 0
for i in range(TEST_SAMPLES):

    print(f"computing results for {i}th image ...")

    # generate a batch (of size 1) of images together with its one-hot labels;
    # the builtin next() is used because the iterator's .next() method is not
    # available on every Keras version
    images, labels = next(test_generator)

    # set the input image to the input index
    # NOTE(review): assumes the model accepts the generator's float input as-is;
    # confirm for models exported with integer input tensors
    interpret.set_tensor(input_ind, images)

    # run the inference
    interpret.invoke()

    # read the predictions from the output tensor; get_tensor returns a copy,
    # so no reference to the interpreter's internal buffer is kept
    predictions = interpret.get_tensor(out_ind)

    # get the highest predicted class
    pred_class = np.argmax(predictions[0])
    # take the true class from the label delivered with this very image, so it
    # cannot get out of step with the generator's position
    true_class = np.argmax(labels[0])

    pred.append(pred_class)
    true.append(true_class)

    if pred_class == true_class:
        accuracy_count += 1

In [None]:
# Share of correctly classified test images, as a percentage rounded to
# three decimals.
accuracy_pct = round((accuracy_count / TEST_SAMPLES) * 100, 3)
print(f"accuracy percentage for {QUANT_NAME_MAP[QUANT_TYPE]}: {accuracy_pct}% \n")

In [None]:
# Print the confusion matrix and the per-class classification report for the
# collected predictions, each section framed by a dashed divider.
divider = "-" * 50

print(divider)
quant_label = QUANT_NAME_MAP[QUANT_TYPE]
print(f"Confusion matrix for {quant_label}: \n")
conf_mat = confusion_matrix(y_true=true, y_pred=pred)
print(conf_mat)

print(divider)
print(f"Classification report for {quant_label}: \n")
report_text = classification_report(y_true=true, y_pred=pred, target_names=LABELS)
print(report_text)

print(divider)