# OldBaseline_KLL_MLT22 Git revision: model.py (EXPLANATION)

Below is the complete model code. Let's break it down in detail.

In [None]:
import numpy as np
import h5py
# import setGPU
import argparse

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda, BatchNormalization, Activation, Concatenate, Dropout, Layer
from tensorflow.keras.layers import ReLU, LeakyReLU
from tensorflow.keras import backend as K
#tf.keras.mixed_precision.set_global_policy('mixed_float16')
import math

from datetime import datetime
from tensorboard import program
import os
import pathlib
# import matplotlib
# import matplotlib.pyplot as plt
# matplotlib.use('agg')

import pickle
from autoencoder_classes import AE

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from neptunecontrib.monitoring.keras import NeptuneMonitor
from custom_layers import Sampling

from qkeras import QDense, QActivation, QBatchNormalization
import tensorflow_model_optimization as tfmot
tsk = tfmot.sparsity.keras


def build_AE(input_shape, latent_dim):
    """Build a dense autoencoder as a single Keras model.

    The input is batch-normalized, passed through Dense(32) and Dense(16)
    blocks (Dense -> BatchNormalization -> LeakyReLU), projected to
    ``latent_dim`` units, and expanded back through Dense(16) and Dense(32)
    blocks to a Dense output layer of ``input_shape`` units.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the output layer.
        latent_dim: Number of units of the bottleneck Dense layer.

    Returns:
        The uncompiled autoencoder ``Model``. Its summary is printed as a
        side effect.
    """

    def dense_block(tensor, units):
        # Dense -> BatchNormalization -> LeakyReLU, instantiated in that order.
        tensor = Dense(units, kernel_initializer=tf.keras.initializers.HeUniform())(tensor)
        tensor = BatchNormalization()(tensor)
        return LeakyReLU(alpha=0.3)(tensor)

    # `(input_shape)` without a trailing comma is just the int itself;
    # pass an explicit 1-tuple as the shape.
    inputArray = Input(shape=(input_shape,))
    x = BatchNormalization()(inputArray)
    for units in (32, 16):
        x = dense_block(x, units)
    # Bottleneck: no normalization or activation is applied to the latent code.
    encoder = Dense(latent_dim, kernel_initializer=tf.keras.initializers.HeUniform())(x)

    #decoder
    x = encoder
    for units in (16, 32):
        x = dense_block(x, units)
    decoder = Dense(input_shape, kernel_initializer=tf.keras.initializers.HeUniform())(x)

    #create autoencoder
    autoencoder = Model(inputs=inputArray, outputs=decoder)
    autoencoder.summary()

    return autoencoder

def build_VAE_orig(input_shape, latent_dim):
    """Build the encoder and decoder models of the LeakyReLU/HeUniform VAE.

    Encoder: batch-normalized input, Dense(32) and Dense(16) blocks
    (Dense -> BatchNormalization -> LeakyReLU), then two Dense heads named
    ``latent_mu`` and ``latent_logvar`` and the project's ``Sampling`` layer
    applied to ``[mu, logvar]`` to produce ``z``.
    Decoder: Dense(16) and Dense(32) blocks followed by a Dense output layer
    of ``input_shape`` units.

    Every Dense kernel uses ``HeUniform(seed=42)``.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the decoder output layer.
        latent_dim: Number of units of ``mu``, ``logvar`` and the decoder
            input.

    Returns:
        ``(encoder, decoder)``: the encoder maps the input to
        ``[mu, logvar, z]``; the decoder maps a latent vector to the
        reconstruction. Both summaries are printed as a side effect.
    """

    def he_uniform():
        # A fresh initializer object per layer, each seeded with 42.
        return tf.keras.initializers.HeUniform(seed=42)

    def dense_block(tensor, units):
        # Dense -> BatchNormalization -> LeakyReLU, instantiated in that order.
        tensor = Dense(units, kernel_initializer=he_uniform())(tensor)
        tensor = BatchNormalization()(tensor)
        return LeakyReLU(alpha=0.3)(tensor)

    #encoder
    # Explicit 1-tuple, consistent with the `(latent_dim,)` decoder input below.
    inputArray = Input(shape=(input_shape,))
    x = BatchNormalization()(inputArray)
    for units in (32, 16):
        x = dense_block(x, units)
    mu = Dense(latent_dim, name='latent_mu', kernel_initializer=he_uniform())(x)
    logvar = Dense(latent_dim, name='latent_logvar', kernel_initializer=he_uniform())(x)

    # Use reparameterization trick to ensure correct gradient
    z = Sampling()([mu, logvar])

    # Create encoder
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')
    encoder.summary()

    #decoder
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    x = d_input
    for units in (16, 32):
        x = dense_block(x, units)
    dec = Dense(input_shape, kernel_initializer=he_uniform())(x)

    # Create decoder
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

    return encoder, decoder

def build_VAE(input_shape, latent_dim):
    """Build the encoder and decoder models of the ReLU/lecun_uniform VAE.

    Encoder: Dense(32) and Dense(16) blocks (Dense -> BatchNormalization ->
    ReLU), then two Dense heads named ``latent_mu`` and ``latent_logvar``
    whose kernels are initialized with zeros, and the project's ``Sampling``
    layer applied to ``[mu, logvar]`` to produce ``z``.
    Decoder: Dense blocks of 16, 32, 64 and 128 units followed by a Dense
    output layer of ``input_shape`` units.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the decoder output layer.
        latent_dim: Number of units of ``mu``, ``logvar`` and the decoder
            input.

    Returns:
        ``(encoder, decoder)``: the encoder maps the input to
        ``[mu, logvar, z]``; the decoder maps a latent vector to the
        reconstruction. Both summaries are printed as a side effect.
    """

    def dense_block(tensor, units):
        # Dense -> BatchNormalization -> ReLU, instantiated in that order.
        tensor = Dense(units, kernel_initializer='lecun_uniform')(tensor)
        tensor = BatchNormalization()(tensor)
        return ReLU()(tensor)

    #encoder
    # Explicit 1-tuple, consistent with the `(latent_dim,)` decoder input below.
    inputArray = Input(shape=(input_shape,))
    x = inputArray
    for units in (32, 16):
        x = dense_block(x, units)
    mu = Dense(latent_dim, name='latent_mu', kernel_initializer='zeros')(x)
    logvar = Dense(latent_dim, name='latent_logvar', kernel_initializer='zeros')(x)

    # Use reparameterization trick to ensure correct gradient
    z = Sampling()([mu, logvar])

    # Create encoder
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')
    encoder.summary()

    #decoder
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    y = d_input
    # The decoder is deeper than the encoder: it widens up to 128 units.
    for units in (16, 32, 64, 128):
        y = dense_block(y, units)
    dec = Dense(input_shape, kernel_initializer='lecun_uniform')(y)

    # Create decoder
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

    return encoder, decoder

def build_VAE_nobn(input_shape, latent_dim):
    """Build the encoder and decoder models of the VAE without batch norm.

    Encoder: Dense(32) and Dense(16) layers, each followed by ReLU, then two
    Dense heads named ``latent_mu`` and ``latent_logvar`` whose kernels are
    initialized with zeros, and the project's ``Sampling`` layer applied to
    ``[mu, logvar]`` to produce ``z``.
    Decoder: Dense layers of 16, 32, 64 and 128 units, each followed by ReLU,
    then a Dense output layer of ``input_shape`` units.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the decoder output layer.
        latent_dim: Number of units of ``mu``, ``logvar`` and the decoder
            input.

    Returns:
        ``(encoder, decoder)``: the encoder maps the input to
        ``[mu, logvar, z]``; the decoder maps a latent vector to the
        reconstruction. Both summaries are printed as a side effect.
    """

    def dense_relu(tensor, units):
        # Dense -> ReLU, instantiated in that order (no normalization layer).
        tensor = Dense(units, kernel_initializer='lecun_uniform')(tensor)
        return ReLU()(tensor)

    #encoder
    # Explicit 1-tuple, consistent with the `(latent_dim,)` decoder input below.
    inputArray = Input(shape=(input_shape,))
    x = inputArray
    for units in (32, 16):
        x = dense_relu(x, units)
    mu = Dense(latent_dim, name='latent_mu', kernel_initializer='zeros')(x)
    logvar = Dense(latent_dim, name='latent_logvar', kernel_initializer='zeros')(x)

    # Use reparameterization trick to ensure correct gradient
    z = Sampling()([mu, logvar])

    # Create encoder
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')
    encoder.summary()

    #decoder
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    y = d_input
    for units in (16, 32, 64, 128):
        y = dense_relu(y, units)
    dec = Dense(input_shape, kernel_initializer='lecun_uniform')(y)

    # Create decoder
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

    return encoder, decoder

    
def build_QVAE(input_shape, latent_dim, quant_size=12, integer=4, symmetric=0, pruning='pruned', batch_size=1024):
    """Build the encoder and decoder of a quantized, optionally pruned VAE.

    Encoder: the input goes through a fixed ``quantized_bits(16,10,0,alpha=1)``
    activation and a ``QBatchNormalization``, then two hidden blocks of 32 and
    16 units (dense -> ``QBatchNormalization`` -> ReLU), then the ``mu`` and
    ``logvar`` heads and the project's ``Sampling`` layer producing ``z``.
    With ``quant_size == 0`` the dense and activation layers are plain Keras
    ``Dense`` / ``Activation('relu')``; otherwise they are ``QDense`` /
    ``QActivation('quantized_relu(...)')``. When ``pruning == 'pruned'`` every
    layer of the hidden blocks and both heads is wrapped with
    ``prune_low_magnitude``.
    Decoder: plain Keras Dense(16) and Dense(32) blocks
    (Dense -> BatchNormalization -> ReLU) and a Dense output layer of
    ``input_shape`` units; it is neither quantized nor pruned.

    The arguments are honoured as passed. Earlier revisions overwrote
    ``quant_size``, ``integer``, ``symmetric`` and ``pruning`` with their
    default values at the top of the body, so calls relying on the defaults
    build the same model as before.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the decoder output layer and in the pruning
            step estimate.
        latent_dim: Number of units of ``mu``, ``logvar`` and the decoder
            input.
        quant_size: Bit width used in the hidden ``quantized_bits`` /
            ``quantized_relu`` strings; 0 selects the non-quantized layers.
        integer: Second argument of the hidden ``quantized_bits`` strings.
        symmetric: Third argument of every weight/bias ``quantized_bits``
            string.
        pruning: ``'pruned'`` enables the pruning wrappers; any other value
            builds the encoder without them.
        batch_size: Batch size used in the pruning step estimate.

    Returns:
        ``(encoder, decoder)``: the encoder maps the input to
        ``[mu, logvar, z]``; the decoder maps a latent vector to the
        reconstruction. Both summaries are printed as a side effect.
    """
    prune = pruning == 'pruned'
    pruning_schedule = None

    if prune:
        # How to estimate the end step (note kept from the original code):
        #     num_samples = input_train.shape[0] * (1 - validation_split)
        #     end_step = np.ceil(num_samples / batch_size).astype(np.int32) * pruning_epochs
        #     so, stop pruning at the 7th epoch
        # NOTE(review): the estimate below uses `input_shape` (the feature
        # count, also used as a Dense width) where the note above refers to
        # the number of training samples, and the multipliers are 5 and 15
        # rather than 7 -- confirm the intended schedule.
        steps = np.ceil((input_shape*0.8)/batch_size).astype(np.int32)
        begin_step = steps*5
        end_step = steps*15
        print('Begin step: ' + str(begin_step) + ', End step: ' + str(end_step))

        pruning_schedule = tfmot.sparsity.keras.PolynomialDecay(
                                initial_sparsity=0.0, final_sparsity=0.5,
                                begin_step=begin_step, end_step=end_step)
        print(pruning_schedule.get_config())

    def maybe_prune(layer):
        # Wrap the layer for magnitude pruning only when pruning was requested.
        if prune:
            return tsk.prune_low_magnitude(layer, pruning_schedule=pruning_schedule)
        return layer

    def hidden_dense(units):
        # Float Dense when quantization is off, otherwise a QDense whose
        # kernel and bias share the same quantizer string.
        init = tf.keras.initializers.HeNormal(seed=42)
        if quant_size == 0:
            return Dense(units, kernel_initializer=init)
        quantizer = f'quantized_bits({quant_size},{integer},{symmetric}, alpha=1)'
        return QDense(units, kernel_initializer=init,
                      kernel_quantizer=quantizer, bias_quantizer=quantizer)

    def hidden_activation():
        # ReLU, quantized to `quant_size` bits unless quantization is off.
        if quant_size == 0:
            return Activation('relu')
        return QActivation(f'quantized_relu(bits={quant_size})')

    def latent_dense(name):
        # Latent heads use a fixed (16, 6) quantizer instead of
        # (quant_size, integer). As before, only the float variant is named;
        # the QDense heads keep their auto-generated names.
        init = tf.keras.initializers.HeNormal(seed=42)
        if quant_size == 0:
            return Dense(latent_dim, name=name, kernel_initializer=init)
        quantizer = f'quantized_bits(16,6,{symmetric}, alpha=1)'
        return QDense(latent_dim, kernel_initializer=init,
                      kernel_quantizer=quantizer, bias_quantizer=quantizer)

    #encoder
    # Explicit 1-tuple, consistent with the `(latent_dim,)` decoder input below.
    inputArray = Input(shape=(input_shape,))
    x = QActivation('quantized_bits(16,10,0,alpha=1)')(inputArray)
    x = QBatchNormalization()(x)
    for units in (32, 16):
        x = maybe_prune(hidden_dense(units))(x)
        x = maybe_prune(QBatchNormalization())(x)
        x = maybe_prune(hidden_activation())(x)
    mu = maybe_prune(latent_dense('latent_mu'))(x)
    logvar = maybe_prune(latent_dense('latent_logvar'))(x)

    # Use reparameterization trick to ensure correct gradient
    z = Sampling()([mu, logvar])

    # Create encoder
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')
    encoder.summary()

    #decoder
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    x = d_input
    for units in (16, 32):
        x = Dense(units, kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
    dec = Dense(input_shape, kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(x)

    # Create decoder
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

    return encoder, decoder

## **Import Statements** 

In [None]:
import numpy as np
import h5py
#import setGPU
import argparse

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda, BatchNormalization, Activation, Concatenate, Dropout, Layer
from tensorflow.keras.layers import ReLU, LeakyReLU
from tensorflow.keras import backend as K
#import matplotlib
#import matplotlib.pyplot as plt
#matplotlib.use('agg')

import pickle
from autoencoder_classes import AE

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from neptunecontrib.monitoring.keras import NeptuneMonitor
from custom_layers import Sampling

from qkeras import QDense, QActivation, QBatchNormalization
import tensorflow_model_optimization as tfmot
tsk = tfmot.sparsity.keras

<span style="color: red;"> Key task: Research about all the packages and dependencies that were called. </span>

### **Function Definition**
Now we will define classical autoencoder structures, which offer an alternative way to carry out anomaly detection.
In this context, we will look at four autoencoder builders (one plain AE and three VAE variants). In summary, they differ in functionality as follows:
* Autoencoder (`build_AE`):
    * Focuses on learning an efficient encoding for input data.
    * Primarily used for reconstructing input data and identifying anomalies based on reconstruction error.
* Variational Autoencoder (`build_VAE_orig`, `build_VAE`, `build_VAE_nobn`):
    * Adds a probabilistic layer by encoding inputs into a distribution rather than a fixed vector.
    * Useful for generating new data points and anomaly detection by assessing how well data fits into the learned distribution.
    * `build_VAE_orig` includes Batch Normalization and LeakyReLU.
    * `build_VAE` modifies the architecture with different initializers, ReLU, and additional layers.
    * `build_VAE_nobn` further simplifies the model by removing Batch Normalization.
    
Each model serves a slightly different purpose and might be chosen based on specific requirements of the anomaly detection task, such as computational efficiency, model complexity, and the nature of the data being analyzed.

#### ***build_AE***

In [None]:
def build_AE(input_shape, latent_dim):
    """Build a dense autoencoder as a single Keras model.

    The input is batch-normalized, passed through Dense(32) and Dense(16)
    blocks (Dense -> BatchNormalization -> LeakyReLU), projected to
    ``latent_dim`` units, and expanded back through Dense(16) and Dense(32)
    blocks to a Dense output layer of ``input_shape`` units.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the output layer.
        latent_dim: Number of units of the bottleneck Dense layer.

    Returns:
        The uncompiled autoencoder ``Model``. Its summary is printed as a
        side effect.
    """

    def dense_block(tensor, units):
        # Dense -> BatchNormalization -> LeakyReLU, instantiated in that order.
        tensor = Dense(units, kernel_initializer=tf.keras.initializers.HeUniform())(tensor)
        tensor = BatchNormalization()(tensor)
        return LeakyReLU(alpha=0.3)(tensor)

    # `(input_shape)` without a trailing comma is just the int itself;
    # pass an explicit 1-tuple as the shape.
    inputArray = Input(shape=(input_shape,))
    x = BatchNormalization()(inputArray)
    for units in (32, 16):
        x = dense_block(x, units)
    # Bottleneck: no normalization or activation is applied to the latent code.
    encoder = Dense(latent_dim, kernel_initializer=tf.keras.initializers.HeUniform())(x)

    #decoder
    x = encoder
    for units in (16, 32):
        x = dense_block(x, units)
    decoder = Dense(input_shape, kernel_initializer=tf.keras.initializers.HeUniform())(x)

    #create autoencoder
    autoencoder = Model(inputs=inputArray, outputs=decoder)
    autoencoder.summary()

    return autoencoder

* **Structure**:

    * ***Input***: Takes input data whose width is given by `input_shape` and passes it through the network. The `Input(...)` call that creates `inputArray` defines the input shape for the model.
    * ***Encoder***:
        * Batch normalization
        * Dense layer (32 units, HeUniform initializer)
        * Batch normalization
        * LeakyReLU activation
        * Dense layer (16 units, HeUniform initializer)
        * Batch normalization
        * LeakyReLU activation
        * Dense layer for encoding to latent space (latent_dim units, HeUniform initializer). It encodes the input into a latent space of dimension latent_dim.
   * ***Decoder***: Mirrors the encoder to reconstruct the input. That is, it reverses the encoder operation.
        * Dense layer (16 units, HeUniform initializer)
        * Batch normalization
        * LeakyReLU activation
        * Dense layer (32 units, HeUniform initializer)
        * Batch normalization
        * LeakyReLU activation
        * Dense layer to reconstruct the input (same shape as input, HeUniform initializer). That is, it decodes back to the original input shape.

* **Functionality**:

    * The autoencoder compresses the input data into a smaller representation (latent space) and then reconstructs the original data from this representation. This is useful for capturing the most important features of the data and for anomaly detection by measuring reconstruction errors. It combines encoder and decoder into a single model and prints the model summary.

<span style="color: red;"> Key question: What is the latent space? </span>

#### ***build_VAE_orig***

In [None]:
def build_VAE_orig(input_shape, latent_dim):
    """Build the encoder and decoder models of the LeakyReLU/HeUniform VAE.

    Encoder: batch-normalized input, Dense(32) and Dense(16) blocks
    (Dense -> BatchNormalization -> LeakyReLU), then two Dense heads named
    ``latent_mu`` and ``latent_logvar`` and the project's ``Sampling`` layer
    applied to ``[mu, logvar]`` to produce ``z``.
    Decoder: Dense(16) and Dense(32) blocks followed by a Dense output layer
    of ``input_shape`` units.

    Every Dense kernel uses ``HeUniform(seed=42)``.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the decoder output layer.
        latent_dim: Number of units of ``mu``, ``logvar`` and the decoder
            input.

    Returns:
        ``(encoder, decoder)``: the encoder maps the input to
        ``[mu, logvar, z]``; the decoder maps a latent vector to the
        reconstruction. Both summaries are printed as a side effect.
    """

    def he_uniform():
        # A fresh initializer object per layer, each seeded with 42.
        return tf.keras.initializers.HeUniform(seed=42)

    def dense_block(tensor, units):
        # Dense -> BatchNormalization -> LeakyReLU, instantiated in that order.
        tensor = Dense(units, kernel_initializer=he_uniform())(tensor)
        tensor = BatchNormalization()(tensor)
        return LeakyReLU(alpha=0.3)(tensor)

    #encoder
    # Explicit 1-tuple, consistent with the `(latent_dim,)` decoder input below.
    inputArray = Input(shape=(input_shape,))
    x = BatchNormalization()(inputArray)
    for units in (32, 16):
        x = dense_block(x, units)
    mu = Dense(latent_dim, name='latent_mu', kernel_initializer=he_uniform())(x)
    logvar = Dense(latent_dim, name='latent_logvar', kernel_initializer=he_uniform())(x)

    # Use reparameterization trick to ensure correct gradient
    z = Sampling()([mu, logvar])

    # Create encoder
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')
    encoder.summary()

    #decoder
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    x = d_input
    for units in (16, 32):
        x = dense_block(x, units)
    dec = Dense(input_shape, kernel_initializer=he_uniform())(x)

    # Create decoder
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

    return encoder, decoder

* **Structure**:

    * ***Input***: Takes an input shape and processes it through the network.
    * ***Encoder***:
        * Batch normalization
        * Dense layer (32 units, HeUniform initializer)
        * Batch normalization
        * LeakyReLU activation
        * Dense layer (16 units, HeUniform initializer)
        * Batch normalization
        * LeakyReLU activation
        * Dense layers for mean (mu) and log variance (logvar) of latent space (latent_dim units, HeUniform initializer)
        * Sampling layer to reparameterize (z)
   * ***Decoder***:
        * Dense layer (16 units, HeUniform initializer)
        * Batch normalization
        * LeakyReLU activation
        * Dense layer (32 units, HeUniform initializer)
        * Batch normalization
        * LeakyReLU activation
        * Dense layer to reconstruct the input (same shape as input, HeUniform initializer)
        
* **Functionality**:
    * The VAE adds a probabilistic twist to the autoencoder by encoding the input into a distribution defined by `mu` and `logvar`. The Sampling layer uses the reparameterization trick to ensure proper gradient flow during training. The decoder reconstructs the input from samples drawn from this latent distribution. This helps in generating new data samples and handling anomalies by comparing the probabilistic reconstruction.

<span style="color: red;"> Key question: What is this probabilistic twist? What is the nature of mu and logvar? How were they initialized, and why? How does it change the nature of an AE? Which reparameterization trick was used? How does the Sampling layer work? What does proper gradient flow mean? What does this latent distribution look like? </span>

#### ***build_VAE***

In [None]:
def build_VAE(input_shape, latent_dim):
    """Build the encoder and decoder models of the ReLU/lecun_uniform VAE.

    Encoder: Dense(32) and Dense(16) blocks (Dense -> BatchNormalization ->
    ReLU), then two Dense heads named ``latent_mu`` and ``latent_logvar``
    whose kernels are initialized with zeros, and the project's ``Sampling``
    layer applied to ``[mu, logvar]`` to produce ``z``.
    Decoder: Dense blocks of 16, 32, 64 and 128 units followed by a Dense
    output layer of ``input_shape`` units.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the decoder output layer.
        latent_dim: Number of units of ``mu``, ``logvar`` and the decoder
            input.

    Returns:
        ``(encoder, decoder)``: the encoder maps the input to
        ``[mu, logvar, z]``; the decoder maps a latent vector to the
        reconstruction. Both summaries are printed as a side effect.
    """

    def dense_block(tensor, units):
        # Dense -> BatchNormalization -> ReLU, instantiated in that order.
        tensor = Dense(units, kernel_initializer='lecun_uniform')(tensor)
        tensor = BatchNormalization()(tensor)
        return ReLU()(tensor)

    #encoder
    # Explicit 1-tuple, consistent with the `(latent_dim,)` decoder input below.
    inputArray = Input(shape=(input_shape,))
    x = inputArray
    for units in (32, 16):
        x = dense_block(x, units)
    mu = Dense(latent_dim, name='latent_mu', kernel_initializer='zeros')(x)
    logvar = Dense(latent_dim, name='latent_logvar', kernel_initializer='zeros')(x)

    # Use reparameterization trick to ensure correct gradient
    z = Sampling()([mu, logvar])

    # Create encoder
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')
    encoder.summary()

    #decoder
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    y = d_input
    # The decoder is deeper than the encoder: it widens up to 128 units.
    for units in (16, 32, 64, 128):
        y = dense_block(y, units)
    dec = Dense(input_shape, kernel_initializer='lecun_uniform')(y)

    # Create decoder
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

    return encoder, decoder

* **Structure**:

    * ***Input***: Takes an input shape and processes it through the network.
    * ***Encoder***:
        * Dense layer (32 units, lecun_uniform initializer)
        * Batch normalization
        * ReLU activation
        * Dense layer (16 units, lecun_uniform initializer)
        * Batch normalization
        * ReLU activation
        * Dense layers for mean (mu, initialized with zeros) and log variance (logvar, initialized with zeros) of latent space (latent_dim units)
        * Sampling layer to reparameterize (z)
    * ***Decoder***:
        * Dense layer (16 units, lecun_uniform initializer)
        * Batch normalization
        * ReLU activation
        * Dense layer (32 units, lecun_uniform initializer)
        * Batch normalization
        * ReLU activation
        * Additional dense layers (64 and 128 units, lecun_uniform initializer)
        * Batch normalization and ReLU activations
        * Dense layer to reconstruct the input (same shape as input, lecun_uniform initializer)

* **Functionality**:

    * Similar to build_VAE_orig, but with several modifications:
        * Uses `lecun_uniform` initializers.
        * More layers in the decoder for potentially better reconstruction.
        * `ReLU` activations instead of `LeakyReLU`. 
        * `mu` and `logvar` initialized with zeros.

<span style="color: red;"> Key question: What is the difference between the lecun_uniform and HeUniform initializers? Why was lecun_uniform chosen in this case? Why is ReLU preferred over LeakyReLU in this case? Why were mu and logvar initialized with zeros?  </span>

#### ***build_VAE_nobn***

In [3]:
def build_VAE_nobn(input_shape, latent_dim):
    """Build the encoder and decoder models of the VAE without batch norm.

    Encoder: Dense(32) and Dense(16) layers, each followed by ReLU, then two
    Dense heads named ``latent_mu`` and ``latent_logvar`` whose kernels are
    initialized with zeros, and the project's ``Sampling`` layer applied to
    ``[mu, logvar]`` to produce ``z``.
    Decoder: Dense layers of 16, 32, 64 and 128 units, each followed by ReLU,
    then a Dense output layer of ``input_shape`` units.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the decoder output layer.
        latent_dim: Number of units of ``mu``, ``logvar`` and the decoder
            input.

    Returns:
        ``(encoder, decoder)``: the encoder maps the input to
        ``[mu, logvar, z]``; the decoder maps a latent vector to the
        reconstruction. Both summaries are printed as a side effect.
    """

    def dense_relu(tensor, units):
        # Dense -> ReLU, instantiated in that order (no normalization layer).
        tensor = Dense(units, kernel_initializer='lecun_uniform')(tensor)
        return ReLU()(tensor)

    #encoder
    # Explicit 1-tuple, consistent with the `(latent_dim,)` decoder input below.
    inputArray = Input(shape=(input_shape,))
    x = inputArray
    for units in (32, 16):
        x = dense_relu(x, units)
    mu = Dense(latent_dim, name='latent_mu', kernel_initializer='zeros')(x)
    logvar = Dense(latent_dim, name='latent_logvar', kernel_initializer='zeros')(x)

    # Use reparameterization trick to ensure correct gradient
    z = Sampling()([mu, logvar])

    # Create encoder
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')
    encoder.summary()

    #decoder
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    y = d_input
    for units in (16, 32, 64, 128):
        y = dense_relu(y, units)
    dec = Dense(input_shape, kernel_initializer='lecun_uniform')(y)

    # Create decoder
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

    return encoder, decoder

* **Structure**:
    * ***Input***: Takes an input shape and processes it through the network.
    * ***Encoder***:
        * Dense layer (32 units, lecun_uniform initializer)
        * ReLU activation
        * Dense layer (16 units, lecun_uniform initializer)
        * ReLU activation
        * Dense layers for mean (mu, initialized with zeros) and log variance (logvar, initialized with zeros) of latent space (latent_dim units)
        * Sampling layer to reparameterize (z)
    * ***Decoder***:
        * Dense layer (16 units, lecun_uniform initializer)
        * ReLU activation
        * Dense layer (32 units, lecun_uniform initializer)
        * ReLU activation
        * Additional dense layers (64 and 128 units, lecun_uniform initializer)
        * ReLU activations
        * Dense layer to reconstruct the input (same shape as input, lecun_uniform initializer)
* **Functionality**:
    * Similar to build_VAE, but without Batch Normalization layers.
    * Simplifies the model, potentially reducing training time and computational resources, at the expense of some benefits provided by Batch Normalization.

<span style="color: red;"> Key question: How much influence does omitting Batch Normalization have? Why would omitting it be preferred? </span>

#### ***build_QVAE***

This function builds a Quantized VAE with optional pruning of less significant weights to achieve a sparse model. This function combines concepts from VAE, quantization, and pruning:

In [None]:
def build_QVAE(input_shape, latent_dim, quant_size=12, integer=4, symmetric=0, pruning='pruned', batch_size=1024):
    """Build the encoder and decoder of a quantized, optionally pruned VAE.

    Encoder: the input goes through a fixed ``quantized_bits(16,10,0,alpha=1)``
    activation and a ``QBatchNormalization``, then two hidden blocks of 32 and
    16 units (dense -> ``QBatchNormalization`` -> ReLU), then the ``mu`` and
    ``logvar`` heads and the project's ``Sampling`` layer producing ``z``.
    With ``quant_size == 0`` the dense and activation layers are plain Keras
    ``Dense`` / ``Activation('relu')``; otherwise they are ``QDense`` /
    ``QActivation('quantized_relu(...)')``. When ``pruning == 'pruned'`` every
    layer of the hidden blocks and both heads is wrapped with
    ``prune_low_magnitude``.
    Decoder: plain Keras Dense(16) and Dense(32) blocks
    (Dense -> BatchNormalization -> ReLU) and a Dense output layer of
    ``input_shape`` units; it is neither quantized nor pruned.

    The arguments are honoured as passed. Earlier revisions overwrote
    ``quant_size``, ``integer``, ``symmetric`` and ``pruning`` with their
    default values at the top of the body, so calls relying on the defaults
    build the same model as before.

    Args:
        input_shape: Number of input features (int). It is also used as the
            number of units of the decoder output layer and in the pruning
            step estimate.
        latent_dim: Number of units of ``mu``, ``logvar`` and the decoder
            input.
        quant_size: Bit width used in the hidden ``quantized_bits`` /
            ``quantized_relu`` strings; 0 selects the non-quantized layers.
        integer: Second argument of the hidden ``quantized_bits`` strings.
        symmetric: Third argument of every weight/bias ``quantized_bits``
            string.
        pruning: ``'pruned'`` enables the pruning wrappers; any other value
            builds the encoder without them.
        batch_size: Batch size used in the pruning step estimate.

    Returns:
        ``(encoder, decoder)``: the encoder maps the input to
        ``[mu, logvar, z]``; the decoder maps a latent vector to the
        reconstruction. Both summaries are printed as a side effect.
    """
    prune = pruning == 'pruned'
    pruning_schedule = None

    if prune:
        # How to estimate the end step (note kept from the original code):
        #     num_samples = input_train.shape[0] * (1 - validation_split)
        #     end_step = np.ceil(num_samples / batch_size).astype(np.int32) * pruning_epochs
        #     so, stop pruning at the 7th epoch
        # NOTE(review): the estimate below uses `input_shape` (the feature
        # count, also used as a Dense width) where the note above refers to
        # the number of training samples, and the multipliers are 5 and 15
        # rather than 7 -- confirm the intended schedule.
        steps = np.ceil((input_shape*0.8)/batch_size).astype(np.int32)
        begin_step = steps*5
        end_step = steps*15
        print('Begin step: ' + str(begin_step) + ', End step: ' + str(end_step))

        pruning_schedule = tfmot.sparsity.keras.PolynomialDecay(
                                initial_sparsity=0.0, final_sparsity=0.5,
                                begin_step=begin_step, end_step=end_step)
        print(pruning_schedule.get_config())

    def maybe_prune(layer):
        # Wrap the layer for magnitude pruning only when pruning was requested.
        if prune:
            return tsk.prune_low_magnitude(layer, pruning_schedule=pruning_schedule)
        return layer

    def hidden_dense(units):
        # Float Dense when quantization is off, otherwise a QDense whose
        # kernel and bias share the same quantizer string.
        init = tf.keras.initializers.HeNormal(seed=42)
        if quant_size == 0:
            return Dense(units, kernel_initializer=init)
        quantizer = f'quantized_bits({quant_size},{integer},{symmetric}, alpha=1)'
        return QDense(units, kernel_initializer=init,
                      kernel_quantizer=quantizer, bias_quantizer=quantizer)

    def hidden_activation():
        # ReLU, quantized to `quant_size` bits unless quantization is off.
        if quant_size == 0:
            return Activation('relu')
        return QActivation(f'quantized_relu(bits={quant_size})')

    def latent_dense(name):
        # Latent heads use a fixed (16, 6) quantizer instead of
        # (quant_size, integer). As before, only the float variant is named;
        # the QDense heads keep their auto-generated names.
        init = tf.keras.initializers.HeNormal(seed=42)
        if quant_size == 0:
            return Dense(latent_dim, name=name, kernel_initializer=init)
        quantizer = f'quantized_bits(16,6,{symmetric}, alpha=1)'
        return QDense(latent_dim, kernel_initializer=init,
                      kernel_quantizer=quantizer, bias_quantizer=quantizer)

    #encoder
    # Explicit 1-tuple, consistent with the `(latent_dim,)` decoder input below.
    inputArray = Input(shape=(input_shape,))
    x = QActivation('quantized_bits(16,10,0,alpha=1)')(inputArray)
    x = QBatchNormalization()(x)
    for units in (32, 16):
        x = maybe_prune(hidden_dense(units))(x)
        x = maybe_prune(QBatchNormalization())(x)
        x = maybe_prune(hidden_activation())(x)
    mu = maybe_prune(latent_dense('latent_mu'))(x)
    logvar = maybe_prune(latent_dense('latent_logvar'))(x)

    # Use reparameterization trick to ensure correct gradient
    z = Sampling()([mu, logvar])

    # Create encoder
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')
    encoder.summary()

    #decoder
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    x = d_input
    for units in (16, 32):
        x = Dense(units, kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
    dec = Dense(input_shape, kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(x)

    # Create decoder
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

    return encoder, decoder

Let's break down the function line by line for a detailed explanation:

##### **Function Signature**

In [None]:
def build_QVAE(input_shape, latent_dim, quant_size=12, integer=4, symmetric=0, pruning='pruned', batch_size=1024):

* `input_shape`: Shape of the input data.
* `latent_dim`: Dimensionality of the latent space.
* `quant_size`: Number of bits for quantization.
* `integer`: Number of integer bits for quantization.
* `symmetric`: Symmetry mode for quantization.
* `pruning`: Specifies if pruning should be applied.
* `batch_size`: Batch size for pruning schedule calculations.

<span style="color: red;"> Key question: What do all these parameters mean? </span>

##### **Setting Default Values**

In [None]:
    quant_size = 12
    integer = 4
    symmetric = 0
    pruning = 'pruned'

These lines unconditionally overwrite `quant_size`, `integer`, `symmetric` and `pruning` with fixed values, so whatever the caller passed for these four arguments is ignored.

##### **Pruning Configuration**

In [None]:
    if pruning == 'pruned':
        ''' How to estimate the enc step:
                num_samples = input_train.shape[0] * (1 - validation_split)
                end_step = np.ceil(num_samples / batch_size).astype(np.int32) * pruning_epochs
                so, stop pruning at the 7th epoch
        '''
        begin_step = np.ceil((input_shape*0.8)/batch_size).astype(np.int32)*5
        end_step = np.ceil((input_shape*0.8)/batch_size).astype(np.int32)*15
        print('Begin step: ' + str(begin_step) + ', End step: ' + str(end_step))
        
        pruning_schedule = tfmot.sparsity.keras.PolynomialDecay(
                                initial_sparsity=0.0, final_sparsity=0.5,
                                begin_step=begin_step, end_step=end_step)
        print(pruning_schedule.get_config())


* First, the condition checks if the `pruning` is set to `pruned`. If it is, the code block within the `if` is executed.
* The comments at the beginning of the `if` describe a general approach to estimate the end step for pruning. They mention stopping pruning at the 7th epoch, although the code that follows uses the multipliers 5 and 15.
* This block calculates the pruning schedule. `begin_step` and `end_step` are calculated based on the input shape and batch size, and some constants (5 and 15 in this case).
* In the definition of `begin_step`, we are using 80% of `input_shape` (it might represent 80% of the training data). `np.ceil((input_shape*0.8)/batch_size)` rounds the estimated number of steps per epoch up to a whole number, `astype(np.int32)` converts it to an integer, and `*5` multiplies it by 5 to estimate the step at which pruning begins.
* `end_step`, similarly to `begin_step`, is calculated to be 15 times the number of steps per epoch.
* `tfmot.sparsity.keras.PolynomialDecay` defines how pruning progresses over time. It defines a polynomial decay schedule for pruning: sparsity increases gradually following a polynomial curve. `initial_sparsity=0.0` is the sparsity (fraction of zero weights) at the beginning of pruning, and `final_sparsity=0.5` is the target sparsity at the end of pruning. `begin_step` is the step at which pruning begins, and `end_step` is the step at which it ends. This creates a schedule where sparsity increases from 0% to 50% in a polynomial manner from `begin_step` to `end_step`. The values `5` and `15` are chosen as multipliers to determine the pruning schedule's start and end points.
* The pruning schedule is printed for verification.

<span style="color: red;"> Key question: What does sparsity mean? </span>

##### **Encoder Definition**

In [None]:
    inputArray = Input(shape=(input_shape))
    x = QActivation(f'quantized_bits(16,10,0,alpha=1)')(inputArray)
    x = QBatchNormalization()(x)

* `inputArray`: Input layer for the encoder with the specified shape by `input_shape`.
* `QActivation`: Applies quantized activation. `QActivation(f'quantized_bits(16,10,0,alpha=1)')` applies quantized activation with 16 bits, 10 integer bits, and no symmetric range (`symmetric=0`).
* `QBatchNormalization`: Applies batch normalization with quantization.

In [None]:
    x = tsk.prune_low_magnitude(Dense(32, kernel_initializer=tf.keras.initializers.HeNormal(seed=42)),\
                                                pruning_schedule=pruning_schedule)(x) if quant_size==0\
        else tsk.prune_low_magnitude(QDense(32, kernel_initializer=tf.keras.initializers.HeNormal(seed=42),\
                kernel_quantizer='quantized_bits(' + str(quant_size) + ','+str(integer)+','+ str(symmetric) +'), alpha=1',\
                bias_quantizer='quantized_bits(' + str(quant_size) + ','+ str(integer) + ',' + str(symmetric) +', alpha=1)'),\
                                                pruning_schedule=pruning_schedule)(x)

* **Note that `\` in Python is used as a line continuation character. This allows you to split a long line of code into multiple lines for better readability without breaking the logic of the code. Here, a conditional expression (`A if condition else B`) is split across several lines, which is an unconventional way of writing the `if` logic.**
* Adds a dense (fully connected) layer with optional quantization and pruning.
* The code block above defines the first dense layer, with conditional pruning and quantization:
    * If `quant_size == 0`, it adds a plain `Dense` layer with 32 units and the HeNormal initializer, wrapped in pruning logic (it is pruned according to `pruning_schedule`), and applies it to `x`.
    * Otherwise (the `else` part), it uses a quantized dense layer `QDense` with the specified kernel and bias quantizers. Note that, as written, the `kernel_quantizer` string closes its parenthesis before `alpha=1` (`'), alpha=1'`), whereas the `bias_quantizer` string and the second dense layer use `', alpha=1)'`; this looks like a typo in the code.

In [None]:
    x = tsk.prune_low_magnitude(QBatchNormalization(), pruning_schedule=pruning_schedule)(x)
    x = tsk.prune_low_magnitude(Activation('relu'),pruning_schedule=pruning_schedule)(x) if quant_size==0\
        else tsk.prune_low_magnitude(QActivation('quantized_relu(bits=' + str(quant_size) + ')'),pruning_schedule=pruning_schedule)(x)

* Adds batch normalization and activation layers with optional quantization and pruning.
* **The first line** applies a Quantized Batch Normalization layer with pruning to `x`. Note that there is no backslash here as the statement fits on a single line.
* **The second line** adds an activation layer with conditional pruning and quantization. The part before the `\` adds a `ReLU` activation through `Activation`, which is used if `quant_size == 0`. Otherwise, `QActivation` with a `quantized_relu` is used.
* The pruning schedule `pruning_schedule` ensures that layers are pruned according to a defined sparsity pattern.
* Layers are organized sequentially, with each layer's output serving as the input to the next layer.

In [None]:
    x = tsk.prune_low_magnitude(Dense(16, kernel_initializer=tf.keras.initializers.HeNormal(seed=42)),\
                                            pruning_schedule=pruning_schedule)(x) if quant_size==0\
        else tsk.prune_low_magnitude(QDense(16, kernel_initializer=tf.keras.initializers.HeNormal(seed=42),\
                kernel_quantizer='quantized_bits(' + str(quant_size) + ','+str(integer)+','+ str(symmetric) +', alpha=1)',\
                bias_quantizer='quantized_bits(' + str(quant_size) + ','+ str(integer) + ',' + str(symmetric) +', alpha=1)'),\
                                    pruning_schedule=pruning_schedule)(x)

* The code block above refers to a **Second Dense Layer**. It adds another dense layer with optional quantization and pruning. It is similar to the previous dense layer, but with 16 units.

In [None]:
    x = tsk.prune_low_magnitude(QBatchNormalization(), pruning_schedule=pruning_schedule)(x)
    x = tsk.prune_low_magnitude(Activation('relu'),pruning_schedule=pruning_schedule)(x) if quant_size==0\
        else tsk.prune_low_magnitude(QActivation('quantized_relu(bits=' + str(quant_size) + ')'),\
                                    pruning_schedule=pruning_schedule)(x)

* Adds batch normalization and activation layers with optional quantization and pruning. It is the same code block as the previous one about `QBatchNormalization` and `Activation`.

##### **Latent Space**

In [None]:
    mu = tsk.prune_low_magnitude(Dense(latent_dim, name = 'latent_mu', kernel_initializer=tf.keras.initializers.HeNormal(seed=42)))(x) if quant_size==0\
        else tsk.prune_low_magnitude(QDense(latent_dim, kernel_initializer=tf.keras.initializers.HeNormal(seed=42),\
                kernel_quantizer='quantized_bits(' + str(16) + ',6,'+ str(symmetric) +', alpha=1)',\
                bias_quantizer='quantized_bits(' + str(16) + ',6,'+ str(symmetric) +', alpha=1)'),\
                                    pruning_schedule=pruning_schedule)(x)
    logvar = tsk.prune_low_magnitude(Dense(latent_dim, name = 'latent_logvar', kernel_initializer=tf.keras.initializers.HeNormal(seed=42)),\
                                    pruning_schedule=pruning_schedule)(x) if quant_size==0\
        else tsk.prune_low_magnitude(QDense(latent_dim, kernel_initializer=tf.keras.initializers.HeNormal(seed=42),\
                kernel_quantizer='quantized_bits(' + str(16) + ',6,'+ str(symmetric) +', alpha=1)',\
                bias_quantizer='quantized_bits(' + str(16) + ',6,'+ str(symmetric) +', alpha=1)'),\
                                    pruning_schedule=pruning_schedule)(x)

* `mu`: Mean of the latent space distribution.
* `logvar`: Log variance of the latent space distribution.
* Both are optionally quantized and pruned.
* The code block above creates two dense layers for the latent space representation: one for the mean (`mu`) and one for the log variance (`logvar`). It uses quantized dense layers (`QDense`) if `quant_size` is not `0`, with specific quantizers.

In [None]:
    z = Sampling()([mu, logvar])

* `z`: Latent variable sampled using the reparameterization trick.
* The `Sampling` layer takes `mu` and `logvar` as input and outputs a sampled latent vector `z`.
* Note that in a VAE, we aim to encode input data into a latent space, and then decode it back to reconstruct the input. The VAE introduces stochasticity (randomness) in the encoding process, allowing it to learn a smooth latent space.
* `mu` (the mean vector of the Gaussian distribution in the latent space) and `logvar` (the logarithm of the variance vector of the Gaussian distribution in the latent space) are outputs of the encoder network. They represent the parameters of a Gaussian distribution.
* The `Sampling` layer is a custom layer that uses the reparameterization trick to sample points from the Gaussian distribution defined by `mu` and `logvar`.
* Geometrically, `mu` represents the center of the Gaussian distribution in the latent space and `logvar` determines the spread or dispersion around `mu`.
* Directly sampling from a Gaussian distribution in the latent space could lead to problems in gradient computation during backpropagation. To address this, the **Reparameterization Trick** is used, which introduces a deterministic component (`mu`) and a stochastic component (sampled from a standard normal distribution).
* `z` represents a point in the latent space that follows a Gaussian distribution. It encodes the information of the input data in a compact form. The stochastic nature of `z` allows the VAE to generate diverse samples from the learned distribution, enhancing its generative capabilities.
* **Geometrically**, the encoder maps input data to a point `mu` in the latent space, with an associated spread determined by `logvar`. The `Sampling` layer then perturbs this point by adding a scaled random vector, resulting in `z`. This approach ensures that the latent space is populated in a way that captures the variability of the input data, allowing the decoder to reconstruct inputs from diverse latent codes.
* `z = Sampling()([mu, logvar])` is a crucial step in the VAE pipeline that introduces controlled randomness into the latent space representation, facilitating effective training and meaningful data generation.

In [None]:
    encoder = Model(inputArray, [mu, logvar, z], name='encoder')    
    encoder.summary()

* Defines and summarizes the encoder model. The model is defined with `inputArray` as input and `[mu, logvar, z]` as outputs.

##### **Decoder Definition**

In [None]:
    d_input = Input(shape=(latent_dim,), name='decoder_input')
    x = Dense(16, kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(d_input)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dense(32, kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(x)    
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    dec = Dense(input_shape, kernel_initializer=tf.keras.initializers.HeNormal(seed=42))(x)

* `d_input`: Input layer for the decoder. It takes in a tensor of shape `(latent_dim,)`. This tensor is the latent vector `z` sampled from the encoder.
* Several dense layers with batch normalization and activation.
* **The first dense layer** (`Dense(16)`) creates a dense (fully connected) layer with 16 units. The number of units is a hyperparameter that can be tuned. Then, `kernel_initializer=...` sets the weights of the dense layer according to the `HeNormal` initialization method, suitable for layers with `ReLU` activation functions. The `seed=42` ensures reproducibility.
* **The first Batch Normalization** normalizes the outputs of the previous dense layer, improving training stability and speed.
* **The first activation layer** applies the `ReLU` activation function, introducing non-linearity to the model and allowing it to learn more complex patterns.
* **The second dense layer** creates another dense layer, now with 32 units, again using the HeNormal initializer and the same seed.
* Then, `BatchNormalization` and `Activation` are applied again. 
* **The final dense layer** creates a layer with the same number of units as the original input shape. This layer maps the latent space representation back to the original input space dimensions. As before, `HeNormal` initializer sets the weights.
* `dec`: Output layer of the decoder.

In [None]:
    decoder = Model(d_input, dec, name='decoder')
    decoder.summary()

* Defines and summarizes the decoder model.
* `Model(d_input, dec, name='decoder')` defines the decoder model, specifying `d_input` as the input and `dec` as the output. The name of the model is set to `decoder`.
* `decoder.summary()` prints a summary of the decoder model, including the layers, their output shapes, and the number of parameters.

##### **Return Models**

In [None]:
    return encoder, decoder

* Returns the encoder and decoder models.
* In summary, it is good to know the **Geometric Interpretation** of how the decoder works.
    * ***Input Layer*** (`d_input`):
        * The decoder starts with the latent vector z (of shape `(latent_dim,)`), representing a point in the latent space. The latent vector `z` lives in a space with dimensions defined by `latent_dim`. The `decoder` will process the `z` vector through several layers to transform it back to the original input space, producing an output with shape `(input_shape,)`.
        * The latent space is `latent_dim`-dimensional. This is typically much lower-dimensional than the original input space. The purpose of this dimensionality reduction is to capture the most important features of the input data in a compact representation.
    * ***Dense and Activation Layers***:
        * The first dense layer maps the latent vector to a 16-dimensional space, followed by `ReLU` activation.
        * This transformation can be seen as the decoder learning to expand and interpret the compact representation (latent vector) into a higher-dimensional space.
        * The second dense layer further maps this 16-dimensional representation to a 32-dimensional space, again followed by `ReLU` activation.
    * ***Final Dense Layer***:
        * The final dense layer maps the 32-dimensional representation back to the original input shape (the original input space is `input_shape`-dimensional, it is the space where the input data resides), effectively reconstructing the input data from the latent representation.
* **What is the importance of the Decoder?**

The decoder in this VAE setup takes the latent representation, gradually transforms it through several dense and activation layers, and finally reconstructs the input data. This process is essential for both learning a meaningful latent space and for generating new data samples.

    * ***Reconstruction***
        * The primary role of the decoder is to take the latent representation `z` and reconstruct it back to the original input space.
    * ***Generative Capability***
        * Because the decoder can reconstruct data from any point in the latent space, it can be used to generate new data by sampling from the latent space.
    * ***Smoothness and Continuity***
        * The structure of the decoder ensures that small changes in the latent space correspond to small changes in the reconstructed output, making the latent space smooth and continuous.