In [11]:
import tensorflow as tf
import numpy as np
import time
import os
from sklearn.metrics import accuracy_score

# Locations of the trained Keras model and of the TFLite file derived from it.
KERAS_MODEL_PATH = '/content/mnist_cnn.h5'
TFLITE_MODEL_PATH = '/content/mnist_cnn_quantized.tflite'

# Load the trained Keras model; it serves as the baseline for the comparison.
model = tf.keras.models.load_model(KERAS_MODEL_PATH)

# Convert the model to TFLite with the converter's DEFAULT optimization enabled.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_quantized_model = converter.convert()

# Write the converted model to disk so it can be sized and reloaded later.
with open(TFLITE_MODEL_PATH, 'wb') as tflite_file:
    tflite_file.write(tflite_quantized_model)

# On-disk size, in bytes, of each model file.
original_model_size, quantized_model_size = (
    os.path.getsize(path) for path in (KERAS_MODEL_PATH, TFLITE_MODEL_PATH)
)

# Load only the MNIST test split; the training split is discarded.
(_, _), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# Convert to float32, divide by 255, and append a trailing channel axis.
x_test = x_test.astype('float32') / 255.0
x_test = np.expand_dims(x_test, -1)

# Untimed warm-up: the first predict() call pays one-time setup cost that
# would otherwise be folded into the measured average.
model.predict(x_test[:32], verbose=0)

# perf_counter() is monotonic and high-resolution, unlike time.time(), which
# can jump if the system clock is adjusted.
# NOTE: this is a batched, amortized per-sample time (predict() processes the
# whole array in batches), not the latency of a single-sample call.
start_time = time.perf_counter()
y_pred = model.predict(x_test, verbose=0)
original_inference_time = (time.perf_counter() - start_time) / len(x_test)

# Predicted class = index of the largest score in each output row.
original_accuracy = accuracy_score(y_test, np.argmax(y_pred, axis=1))

# Load the converted model into a TFLite interpreter and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path="/content/mnist_cnn_quantized.tflite")
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Tensor indices do not change between invocations; look them up once.
input_index = input_details[0]['index']
output_index = output_details[0]['index']

# Untimed warm-up invoke so any first-run overhead stays out of the timings.
if len(x_test):
    interpreter.set_tensor(
        input_index, np.expand_dims(x_test[0], axis=0).astype(np.float32)
    )
    interpreter.invoke()

predictions = []
inference_times = []

# Run one sample at a time; only invoke() is timed, not tensor copy-in/out.
for sample in x_test:
    # Add the batch axis the interpreter input expects and feed it as float32.
    input_data = np.expand_dims(sample, axis=0).astype(np.float32)
    interpreter.set_tensor(input_index, input_data)

    # perf_counter() is monotonic and high-resolution, which suits the short
    # interval measured here better than time.time().
    start = time.perf_counter()
    interpreter.invoke()
    inference_times.append(time.perf_counter() - start)

    output = interpreter.get_tensor(output_index)
    predictions.append(np.argmax(output))

quantized_accuracy = accuracy_score(y_test, predictions)
# Mean single-sample invoke() time, in seconds.
quantized_inference_time = np.mean(inference_times)

# Print the comparison as a pipe-delimited table.
# One shared template keeps every row aligned with the separator line
# (inner widths 22 / 22 / 24 including the padding spaces); the original
# hand-padded rows drifted out of alignment in the accuracy and time rows.
ROW_TEMPLATE = "| {:<20} | {:<20} | {:<22} |"

table_rows = [
    ("Size (bytes)", original_model_size, quantized_model_size),
    ("Accuracy", f"{original_accuracy:.4f}", f"{quantized_accuracy:.4f}"),
    (
        "Avg Inference Time",
        f"{original_inference_time:.6f} s",
        f"{quantized_inference_time:.6f} s",
    ),
]

print("\nModel Comparison:")
print(ROW_TEMPLATE.format("Metric", "Original Model", "Quantized Model"))
print("|----------------------|----------------------|------------------------|")
for row in table_rows:
    print(ROW_TEMPLATE.format(*row))




Saved artifact at '/tmp/tmp4mvx5rrd'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 28, 28, 1), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  133633793804048: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133633793817680: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133633671455056: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133633671456784: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133633671456400: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133633671448720: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133633671448912: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133633671447760: TensorSpec(shape=(), dtype=tf.resource, name=None)

Model Comparison:
| Metric               | Original Model       | Quantized Model       |
|----------------------|----------------------|--------------------