# Baseline network + pruning + quantization

This notebook trains the baseline network with the exact same architecture as the one in the paper [Recognition of handwritten Latin characters with diacritics using CNN](https://journals.pan.pl/dlibra/publication/136210/edition/119099/content/bulletin-of-the-polish-academy-of-sciences-technical-sciences-recognition-of-handwritten-latin-characters-with-diacritics-using-cnn-lukasik-edyta-charytanowicz-malgorzata-milosz-marek-tokovarov-michail-kaczorowska-monika-czerwinski-dariusz-zientarski-tomasz-2021-69-no-1?language=en)

Model architecture description:

"The architecture of the concrete CNN is shown in Fig. 2.
The input is a 32x32 binarized matrix. The input is then propagated
through 12 adaptable layers. First come two convolutional layers
having 32 filters with the size of 3x3 and stride 1.
Secondly, the output of the convolutional layer is fed to the
ReLU function. The output is down-sampled using a max-pooling
operation with a 2x2 stride. Next, the dropout technique is
used with the coefficient 0.25. The four operations (two
convolutions, nonlinearity, max-pooling, and dropout) are repeated,
using 64 filters for the convolutional layers. The output of the
last layer is then flattened and fed through a fully connected
layer with 256 neurons and ReLU nonlinearities, dropped out
with the 0.25 coefficient, and a final output layer is fully
connected with a Softmax activation function. The Adam optimizer
and the cross-entropy loss function were used in the network.
The output is a probability distribution over 89 classes."


Additionally, this network was pruned and quantized after training.



After training, the model is serialized and uploaded to the W&B project.

In [None]:
! pip install -q tensorflow-model-optimization

In [None]:
! pip install -U tensorboard_plugin_profile

In [None]:
import pathlib
import shutil
import os
import datetime
import numpy as np
import wandb
from wandb.keras import WandbCallback
import tensorflow as tf
import tensorflow_model_optimization as tfmot
import matplotlib.pyplot as plt

from typing import List

def load_data(run, artifact_name="phcd_paper_splits_tfds") -> List[tf.data.Dataset]:
    """
    Downloads datasets from a wandb artifact and loads them into a list of tf.data.Datasets.

    Args:
        run: wandb run object; ``run.use_artifact`` is called on it to
            declare the dataset artifact as an input of the run.
        artifact_name: Name of the artifact inside the "master-thesis"
            project. The ``:latest`` alias is always requested.

    Returns:
        The loaded datasets in the order ``[train, test, val]``.
    """
    artifact = run.use_artifact(f"master-thesis/{artifact_name}:latest")

    # Reuse a previously downloaded copy when it is already on disk.
    artifact_dir = pathlib.Path(
        f"./artifacts/{artifact.name.replace(':', '-')}"
    ).resolve()
    if not artifact_dir.exists():
        artifact_dir = pathlib.Path(artifact.download()).resolve()

    # `tf.data.Dataset.load` only exists in newer TensorFlow releases; older
    # ones expose the same loader as `tf.data.experimental.load`. Detecting the
    # attribute avoids parsing the version string (the former check read the
    # non-existent `tf._version_` and ignored the major version).
    load_function = getattr(tf.data.Dataset, "load", None) or tf.data.experimental.load

    return [
        load_function(str(artifact_dir / split), compression="GZIP")
        for split in ("train", "test", "val")
    ]

def get_number_of_classes(ds: tf.data.Dataset) -> int:
    """
    Counts the distinct label values present in a dataset.

    The label component (second item of every ``(x, y)`` element) is read for
    the whole dataset, joined into one array and de-duplicated.
    """
    labels_only = ds.map(lambda x, y: y)
    label_chunks = list(labels_only.as_numpy_iterator())
    distinct_labels = np.unique(np.concatenate(label_chunks))
    return len(distinct_labels)

def get_number_of_examples(ds: tf.data.Dataset) -> int:
    """
    Counts the elements produced by iterating over a dataset.

    One is counted per item yielded by ``ds``; if ``ds`` is batched this is
    presumably the number of batches rather than of single examples.
    """
    element_count = 0
    for _ in ds:
        element_count += 1
    return element_count

def preprocess_dataset(ds: tf.data.Dataset, batch_size: int, cache: bool = True) -> tf.data.Dataset:
    """
    Normalizes and re-batches a dataset of ``(x, y)`` pairs.

    ``x`` is cast to float32 and divided by 255.0, then the dataset is
    unbatched and batched again with ``batch_size``. When ``cache`` is true
    the result is additionally cached and prefetched.
    """
    def _scale_inputs(x, y):
        # Only the inputs are rescaled; labels pass through untouched.
        return tf.cast(x, tf.float32) / 255.0, y

    rebatched = ds.map(_scale_inputs).unbatch().batch(batch_size)
    if not cache:
        return rebatched
    return rebatched.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

def calculate_model_size_on_disk(path: str) -> int:
    """Return the size, in bytes, of the file stored at ``path``."""
    file_stat = pathlib.Path(path).stat()
    return file_stat.st_size

def calculate_model_num_parameters(model: tf.keras.Model) -> int:
    """Return the parameter count reported by ``model.count_params()``."""
    parameter_total = model.count_params()
    return parameter_total

def calculate_model_flops(summary) -> float:
    """
    Look up the FLOPs figure stored in a run summary.

    The key "GFLOPs" is preferred; "GFLOPS" is the fallback spelling.
    Returns 0 when neither key is present.
    """
    for candidate_key in ("GFLOPs", "GFLOPS"):
        if candidate_key in summary.keys():
            return summary.get(candidate_key)
    return 0

def get_flops(model_h5_path):
    """
    Profile the float operations of a Keras model stored on disk.

    The model is loaded into a fresh TF1-style graph and that graph is handed
    to the TF profiler.

    Args:
        model_h5_path: Path of the saved Keras model to load.

    Returns:
        ``total_float_ops`` as reported by the profiler.
    """
    # silence tensorflow warnings and info messages
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    # Use a dedicated graph per call: profiling the shared default graph would
    # also count the ops of every model loaded by earlier calls. The session is
    # used as a context manager so it is closed when profiling is done.
    graph = tf.Graph()
    with graph.as_default(), tf.compat.v1.Session(graph=graph):
        # Pruned / quantization-aware models contain tfmot wrapper layers that
        # Keras can only deserialize inside these custom-object scopes; the
        # scopes are harmless for plain models.
        with tfmot.sparsity.keras.prune_scope(), tfmot.quantization.keras.quantize_scope():
            tf.keras.models.load_model(model_h5_path)

        run_meta = tf.compat.v1.RunMetadata()
        opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()

        # Profile the graph the model was just built in.
        flops = tf.compat.v1.profiler.profile(
            graph=graph, run_meta=run_meta, cmd='op', options=opts
        )

        return flops.total_float_ops

def plot_history(history, title):
    """
    Draw accuracy and loss curves of a training run side by side.

    Args:
        history: Object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch values.
        title: Text used as the figure's super-title.
    """
    plt.figure(figsize=(15, 7))
    plt.suptitle(title)

    # (subplot position, training series key, validation series key, y label)
    panels = (
        (121, 'accuracy', 'val_accuracy', 'accuracy'),
        (122, 'loss', 'val_loss', 'loss'),
    )
    for position, train_key, val_key, y_label in panels:
        plt.subplot(position)
        plt.plot(history.history[train_key], label='train')
        plt.plot(history.history[val_key], label='val')
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend()

In [None]:
# Report which accelerators TensorFlow can see before any training starts.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
all_devices = tf.config.list_physical_devices()
print("Available devices: ", all_devices)

# Baseline model

In [None]:
# Default hyperparameters; handed to wandb.init as the run config (and reused
# by the later quantization / pruning / PQAT runs).
defaults = dict(
    batch_size=32*2,
    epochs=100,    
    optimizer="adam"  # recorded in the config only; the model cell builds Adam explicitly
)

# Prefix for the W&B run names of every stage in this notebook.
model_name = "baseline"

# Dataset artifact to train on; also attached to each run as a tag.
artifact_name = "phcd_paper_splits_tfds"
run = wandb.init(project="master-thesis", job_type="training", name=model_name, config=defaults, tags=[artifact_name])
    
# hyperparameters
epochs = wandb.config.epochs
bs = wandb.config.batch_size

# load_data returns the splits in the order train, test, val.
ds_train, ds_test, ds_val = load_data(run, artifact_name=artifact_name)

# The class count is inferred from the labels of the validation split.
# NOTE(review): assumes every class occurs in the validation split — confirm.
num_classes = get_number_of_classes(ds_val)

# Normalize and re-batch all splits; only the test split skips cache/prefetch.
ds_train = preprocess_dataset(ds_train, batch_size=bs)
ds_val = preprocess_dataset(ds_val, batch_size=bs)
ds_test = preprocess_dataset(ds_test, batch_size=bs, cache=False)

'''
The architecture of the concrete CNN is shown in Fig. 2.
The input is a 32x32 binarized matrix. 
The input is then propagated through 12 adaptable layers. 
First come two convolutional layers having 32 filters with the size of 3x3 and stride 1.
Secondly, the output of the convolutional layer is fed to the
ReLU function. The output is down-sampled using a max-pool-
ing operation with a 2x2 stride. Next, the dropout technique is
used with the coefficient 0.25. The four operations (two con-
volutions, nonlinearity, max-pooling, and dropout) are repeated,
using 64 filters for the convolutional layers. The output of the
last layer is then flattened and fed through a fully connected
layer with 256 neurons and ReLU nonlinearities, dropped out
with the 0.25 coefficient, and a final output layer is fully con-
nected with a Softmax activation function. The Adam optimizer
and the cross-entropy loss function were used in the network
[24]. The output is a probability distribution over 89 classes.
'''

# Baseline CNN: two conv blocks (2x Conv2D + max-pool + dropout) followed by a
# dense head, mirroring the architecture description quoted above.
model = tf.keras.Sequential(
    [
        tf.keras.layers.InputLayer(input_shape=(32, 32, 1)),

        # 2 Convolutional layers with 32 filters, 3x3 size, and stride 1
        tf.keras.layers.Conv2D(32, (3, 3), strides=1, activation='relu'),
        tf.keras.layers.Conv2D(32, (3, 3), strides=1, activation='relu'),

        # Max-pooling operation with 2x2 stride
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=2),
        # Dropout with coefficient 0.25
        tf.keras.layers.Dropout(0.25),

        # Repeat above 4 operations using 64 filters
        tf.keras.layers.Conv2D(64, (3, 3), strides=1, activation='relu'),
        tf.keras.layers.Conv2D(64, (3, 3), strides=1, activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=2),
        tf.keras.layers.Dropout(0.25),

        # Flatten the output and feed through fully connected layer with 256 neurons
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation='relu'),

        # Dropout with coefficient 0.25
        tf.keras.layers.Dropout(0.25),
        # Output layer: one unit per class, already softmax-normalized.
        tf.keras.layers.Dense(num_classes, activation="softmax"),
    ]
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    # The last layer applies softmax, so the model outputs probabilities, not
    # logits; from_logits must therefore be False.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
model.summary()

In [None]:
# Train the baseline network; metrics and FLOPs are reported to W&B, while
# model files, weights and gradients are not uploaded by the callback.
baseline_wandb_callback = WandbCallback(
    compute_flops=True,
    save_model=False,
    log_weights=False,
    log_gradients=False,
)
history = model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=epochs,
    callbacks=[baseline_wandb_callback],
)

plot_history(history, "Baseline")

# Keep a copy on disk (without optimizer state); later cells reload this file.
tf.keras.models.save_model(model, 'model_baseline.h5', include_optimizer=False)

In [None]:
# calculate model size on disk, flops and number of parameters

# FLOPs and disk size are measured on the saved file; the parameter count is
# taken from the in-memory model.
flops = get_flops('model_baseline.h5')
disk_size = calculate_model_size_on_disk('model_baseline.h5')
num_parameters = calculate_model_num_parameters(model)

# evaluate model on ds_test and log to wandb
# The *_before values are the baseline reference; test_acc_before is printed
# again in the summary cell at the end of the notebook.
test_loss_before, test_acc_before = model.evaluate(ds_test)

data_to_log = {
    "test loss": test_loss_before, 
    "test accuracy": test_acc_before, 
    "number of parameters": num_parameters,
    "disk size": disk_size, 
    "model flops": flops,
    }
print(data_to_log)
wandb.log(data_to_log)
# Close the baseline run; each later stage opens its own run.
run.finish()

# Quantization - allows for better accuracy

In [None]:
# Separate W&B run for the quantization-aware training stage.
run = wandb.init(project="master-thesis", job_type="training", name=f"{model_name}_quantized", config=defaults, tags=[artifact_name])

quantize_model = tfmot.quantization.keras.quantize_model

# Number of fine-tuning epochs for quantization-aware training.
quant_epochs = 2
# Start from the trained baseline weights stored on disk.
base_model = tf.keras.models.load_model('model_baseline.h5')

# q_aware stands for quantization aware.
q_aware_model = quantize_model(base_model)

# `quantize_model` requires a recompile.
# The baseline network ends in a softmax layer, so its outputs are
# probabilities and the loss must not treat them as logits.
q_aware_model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

q_aware_model.summary()

In [None]:
history = q_aware_model.fit(
    ds_train,
    epochs=quant_epochs,
    validation_data=ds_val,
    callbacks=[
        WandbCallback(
            compute_flops=True, 
            save_model=False, 
            log_weights=False, 
            log_gradients=False
        )
    ],
)


plot_history(history, "Quantized")
tf.keras.models.save_model(model, 'model_quantized.h5', include_optimizer=False)

In [None]:
# calculate model size on disk, flops and number of parameters

flops = get_flops('model_quantized.h5')
disk_size = calculate_model_size_on_disk('model_quantized.h5')
num_parameters = calculate_model_num_parameters(q_aware_model)

# evaluate model on ds_test and log to wandb
test_loss_after, test_acc_after = model.evaluate(ds_test)

data_to_log = {
    "test loss": test_loss_after, 
    "test accuracy": test_acc_after, 
    "number of parameters": num_parameters,
    "disk size": disk_size, 
    "model flops": flops,
    }
print(data_to_log)
wandb.log(data_to_log)

test_acc_quantize = test_acc_after
run.finish()

# Pruning model - allows for smaller size (after compression)

In [None]:
# Separate W&B run for the pruning stage.
run = wandb.init(project="master-thesis", job_type="training", name=f"{model_name}_pruned", config=defaults, tags=[artifact_name])
# pruning
prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude

prune_epochs = 4
# ds_train is already batched by preprocess_dataset, so counting its elements
# gives the number of optimizer steps in one epoch.
steps_per_epoch = get_number_of_examples(ds_train)
# The sparsity schedule is expressed in training steps and must span the whole
# pruning run: steps per epoch times the number of pruning epochs.
end_step = steps_per_epoch * prune_epochs

# Define model for pruning.
# Sparsity is ramped from 50% to 80% between step 0 and end_step, with the
# pruning masks updated every 100 steps.
pruning_params = {
    'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(initial_sparsity=0.50,
                                                            final_sparsity=0.80,
                                                            begin_step=0,
                                                            end_step=end_step,
                                                            frequency=100)
}

# Start from the trained baseline weights stored on disk.
base_model = tf.keras.models.load_model('model_baseline.h5')

model_sparse = prune_low_magnitude(base_model, **pruning_params)

# `prune_low_magnitude` requires a recompile.
# The baseline network ends in a softmax layer, so its outputs are
# probabilities and the loss must not treat them as logits.
model_sparse.compile(optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            metrics=['accuracy'])

model_sparse.summary()

In [None]:
callbacks = [
    # Required during pruning: advances the pruning step each batch.
    tfmot.sparsity.keras.UpdatePruningStep(),
    # Writes pruning summaries for the TensorBoard cell at the end.
    tfmot.sparsity.keras.PruningSummaries(log_dir='./prune_summary'),
    WandbCallback(
            compute_flops=True, 
            save_model=False, 
            log_weights=False, 
            log_gradients=False
        )
]

history = model_sparse.fit(
    ds_train,
    epochs=prune_epochs,
    validation_data=ds_val,
    callbacks=callbacks
)

plot_history(history, "Pruned")

# Export without the pruning wrappers: they only carry training-time variables
# that would inflate the file and require custom objects to reload.
# `model_sparse` itself is left untouched for the evaluation cell below.
model_sparse_export = tfmot.sparsity.keras.strip_pruning(model_sparse)
tf.keras.models.save_model(model_sparse_export, 'model_sparse.h5', include_optimizer=False)

In [None]:
# calculate model size on disk, flops and number of parameters

# prune_scope registers the pruning wrapper layers so that get_flops can
# reload the file even when it was saved with the wrappers still attached.
with tfmot.sparsity.keras.prune_scope():
    flops = get_flops('model_sparse.h5')
disk_size = calculate_model_size_on_disk('model_sparse.h5')
# NOTE(review): model_sparse still carries the pruning wrappers, so this count
# presumably includes their bookkeeping variables — confirm.
num_parameters = calculate_model_num_parameters(model_sparse)

# evaluate model on ds_test and log to wandb
test_loss_after, test_acc_after = model_sparse.evaluate(ds_test)

# Log the results of the pruned model (the *_after values), not the baseline
# reference stored in test_loss_before / test_acc_before.
data_to_log = {
    "test loss": test_loss_after, 
    "test accuracy": test_acc_after, 
    "number of parameters": num_parameters,
    "disk size": disk_size, 
    "model flops": flops,
    }
print(data_to_log)
wandb.log(data_to_log)

# Kept under its own name for the summary cell at the end of the notebook.
test_acc_prune = test_acc_after
run.finish()

# PQAT - pruning preserving quantization aware training

In [None]:
run = wandb.init(project="master-thesis", job_type="training", name=f"{model_name}_pqat", config=defaults, tags=[artifact_name])

# PQAT
quant_aware_annotate_model = tfmot.quantization.keras.quantize_annotate_model(
              model_sparse)
pqat_model = tfmot.quantization.keras.quantize_apply(
              quant_aware_annotate_model,
              tfmot.experimental.combine.Default8BitPrunePreserveQuantizeScheme())

pqat_model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
              
callbacks = [
    tfmot.sparsity.keras.UpdatePruningStep(),
    tfmot.sparsity.keras.PruningSummaries(log_dir='./prune_summary'),
    WandbCallback(
            compute_flops=True, 
            save_model=False, 
            log_weights=False, 
            log_gradients=False
        )
]

history = quant_aware_annotate_model.fit(
    ds_train,
    epochs=prune_epochs,
    validation_data=ds_val,
    callbacks=callbacks,
)
plot_history(history, "Pruned model + PQAT")
tf.keras.models.save_model(quant_aware_annotate_model, 'model_quantized_sparse.h5', include_optimizer=False)

In [None]:
# calculate model size on disk, flops and number of parameters
flops = get_flops('model_quantized_sparse.h5')
disk_size = calculate_model_size_on_disk('model_quantized_sparse.h5')
num_parameters = calculate_model_num_parameters(quant_aware_annotate_model)

# evaluate model on ds_test and log to wandb
test_loss_after, test_acc_after = quant_aware_annotate_model.evaluate(ds_test)

data_to_log = {
    "test loss": test_loss_before, 
    "test accuracy": test_acc_before, 
    "number of parameters": num_parameters,
    "disk size": disk_size, 
    "model flops": flops,
    }
print(data_to_log)
wandb.log(data_to_log)

test_acc_pqat = test_acc_after
run.finish()

# Summary & Tensorboard

In [None]:
# Side-by-side test accuracy of every stage, in the order they were trained.
accuracy_report = (
    ('Baseline test accuracy:', test_acc_before),
    ('Quantized test accuracy:', test_acc_quantize),
    ('Pruned test accuracy:', test_acc_prune),
    ('PQAT test accuracy:', test_acc_pqat),
)
for report_label, report_value in accuracy_report:
    print(report_label, report_value)

In [None]:
import zipfile

# (archive to create, saved model to put inside), one pair per stage.
archive_pairs = (
    ('model_baseline.zip', 'model_baseline.h5'),
    ('model_quantized.zip', 'model_quantized.h5'),
    ('model_sparse.zip', 'model_sparse.h5'),
    ('model_quantized_sparse.zip', 'model_quantized_sparse.h5'),
)

# Deflate-compress each saved model so the compressed sizes can be compared.
for archive_path, model_path in archive_pairs:
    with zipfile.ZipFile(archive_path, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        archive.write(model_path)

print('Zipped model files size:\n')
b_size, q_size, s_size, q_s_size = [
    os.path.getsize(archive_path) for archive_path, _ in archive_pairs
]

size_lines = (
    ('Baseline: {} bytes', b_size),
    ('Quantized: {} bytes', q_size),
    ('Sparse:   {} bytes', s_size),
    ('PQAT:     {} bytes', q_s_size),
)
for line_template, archive_size in size_lines:
    print(line_template.format(archive_size))

# Relative reduction of each compressed model against the compressed baseline.
for stage_label, stage_size in (('Quantized', q_size), ('Sparse', s_size), ('PQAT', q_s_size)):
    print(f'{stage_label} model is {((b_size-stage_size)/b_size * 100):.2f}% smaller')

In [None]:
%load_ext tensorboard
%tensorboard --logdir prune_summary