This Jupyter notebook compares the performance of the original model with that of the model modified for quantization.

Making the channels divisible by 8 greatly reduced the inference time on both GPU and CPU.

In [None]:
import torch
from torchinfo import summary

import os
import time
import numpy as np
import random

import phinet_quant
import micromind

In [None]:
def print_size_of_model(model):
    """
    Print the serialized size of a model's ``state_dict``.

    The state dict is written with ``torch.save`` to a scratch file, the size
    of that file is read back, and the result is printed in KB when it is
    smaller than 1 MiB (1048576 bytes) and in MB otherwise.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    """
    import tempfile  # local import: only needed for the scratch directory

    # A private temporary directory avoids overwriting (and then deleting) an
    # unrelated "temp.p" in the working directory, and it is cleaned up even
    # when saving or measuring raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), scratch_path)
        size_in_bytes = os.path.getsize(scratch_path)

    if size_in_bytes < 1048576:
        size_in_kb = size_in_bytes / 1024
        print("{:.3f} KB".format(size_in_kb))
    else:
        size_in_mb = size_in_bytes / 1048576
        print("{:.3f} MB".format(size_in_mb))


def measure_inference_latency(model, input_shape, device = None, repetitions=100, warmup_it = 10):
    """
    Measure the forward-pass latency of a model on one random input.

    As a side effect the model is moved to ``device`` and switched to
    evaluation mode.

    Args:
        model: The neural network model to evaluate.
        input_shape: Shape of a single input sample, without the batch
            dimension (a batch dimension of 1 is prepended).
        device: Device to run on: a string such as ``"cpu"``, ``"cuda"`` or
            ``"cuda:0"``, or a ``torch.device``. When ``None``, the device of
            the model's first parameter is used (CPU if it has no parameters).
        repetitions: Number of timed forward passes.
        warmup_it: Number of untimed forward passes executed first.

    Returns:
        tuple: A tuple containing the mean and standard deviation of the inference time
               measured in milliseconds.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"
    # Normalize so that "cuda", "cuda:0" and torch.device objects are all
    # recognised as CUDA devices below.
    device = torch.device(device)

    dummy_input = torch.randn(1, *input_shape, dtype=torch.float).to(device)

    # Set model to evaluation mode
    model.to(device)
    model.eval()

    timings = []
    # Warm-up and measurement both run without autograd so that the warm-up
    # exercises exactly the code path that is timed afterwards.
    with torch.no_grad():
        for _ in range(warmup_it):
            model(dummy_input)

        if device.type == 'cuda':
            # Make the target GPU current so events are recorded on its stream.
            with torch.cuda.device(device):
                # Drain pending warm-up kernels before the first timed pass.
                torch.cuda.synchronize()
                starter = torch.cuda.Event(enable_timing=True)
                ender = torch.cuda.Event(enable_timing=True)
                for _ in range(repetitions):
                    starter.record()
                    model(dummy_input)
                    ender.record()
                    # Wait for the GPU before reading the elapsed time.
                    torch.cuda.synchronize()
                    timings.append(starter.elapsed_time(ender))
        else:  # CPU (and any other non-CUDA device)
            for _ in range(repetitions):
                # perf_counter is monotonic and high-resolution, unlike time.time
                start_time = time.perf_counter()
                model(dummy_input)
                elapsed_time = (time.perf_counter() - start_time) * 1000.0  # Convert to milliseconds
                timings.append(elapsed_time)

    # Calculate mean and std
    mean_time = np.mean(timings)
    std_time = np.std(timings)

    return mean_time, std_time

def set_random_seeds(random_seed=0):
    """
    Seed the PyTorch, NumPy and Python RNGs with one value for reproducibility.

    Also sets the cuDNN ``deterministic`` flag and clears its ``benchmark``
    flag.

    Args:
        random_seed: Seed passed to every random number generator.
    """
    # The three generators are independent, so they are seeded in one pass.
    for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
        seed_rng(random_seed)

    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

# Original PhiNet vs Modified PhiNet

Comparison of the original PhiNet model and the model modified to make quantization effective (channels divisible by 8 and `padding` within `Conv2d`). The following values are compared:
- Number of parameters;
- Number of operations;
- Inference time;

**The comparison is made with both models in FP32**.

## Large Model
`input_shape = [3, 224, 224], num_layers = 7, alpha = 3, beta = 0.75, t_zero = 6, include_top = True, num_classes = 10`

In [None]:
# Original PhiNet (large configuration)
model = micromind.PhiNet(
    input_shape=[3, 224, 224],
    num_layers=7,
    alpha=3,
    beta=0.75,
    t_zero=6,
    include_top=True,
    num_classes=10,
)
# Modified PhiNet: same hyper-parameters, plus divisor=8
model_quant = phinet_quant.PhiNet(
    input_shape=[3, 224, 224],
    num_layers=7,
    alpha=3,
    beta=0.75,
    t_zero=6,
    include_top=True,
    num_classes=10,
    divisor=8,
)

# Print the models
print(f"Original model:\n{model}\n")
print(f"Modified model:\n{model_quant}")

# Get the summary of the models (parameter and multiply-add counts are read later)
org_model = summary(model, input_size=(1, 3, 224, 224), verbose=0)
mod_model = summary(model_quant, input_size=(1, 3, 224, 224), verbose=0)

## Small Model
`input_shape = [3, 224, 224], num_layers = 7, alpha = 2.3, beta = 0.75, t_zero = 5, include_top = True, num_classes = 10`

In [None]:
# Original PhiNet (small configuration)
model_S = micromind.PhiNet(
    input_shape=[3, 224, 224],
    num_layers=7,
    alpha=2.3,
    beta=0.75,
    t_zero=5,
    include_top=True,
    num_classes=10,
)
# Modified PhiNet: same hyper-parameters, plus divisor=8
model_quant_S = phinet_quant.PhiNet(
    input_shape=[3, 224, 224],
    num_layers=7,
    alpha=2.3,
    beta=0.75,
    t_zero=5,
    include_top=True,
    num_classes=10,
    divisor=8,
)

# Show both architectures
print(f"Original model:\n{model_S}\n")
print(f"Modified model:\n{model_quant_S}\n")

# Summaries are computed in the same order as before: original first, then modified
org_model_S, mod_model_S = (
    summary(net, input_size=(1, 3, 224, 224), verbose=0)
    for net in (model_S, model_quant_S)
)

## Results

In [None]:
# ---- Large model: time both variants on the GPU first, then on the CPU ----
model_gpu_inference_latency, model_quant_gpu_inference_latency = (
    measure_inference_latency(net, device="cuda", input_shape=(3, 224, 224))
    for net in (model, model_quant)
)
model_cpu_inference_latency, model_quant_cpu_inference_latency = (
    measure_inference_latency(net, device="cpu", input_shape=(3, 224, 224))
    for net in (model, model_quant)
)

print(
    "LARGE MODEL",
    f"input_shape: {model.input_shape}, num_layers: {model.num_layers}, alpha: {model.alpha}, beta: {model.beta}, t_zero: {model.t_zero}\n",
    sep="\n",
)

# Index [0] selects the mean latency; the standard deviation is not reported.
print(
    "Number of parameters in the original model: {:,}".format(org_model.total_params),
    "Number of MAC operations in the original model: {:,}".format(org_model.total_mult_adds),
    "Original model GPU Inference Latency: {:.3f} ms".format(model_gpu_inference_latency[0]),
    "Original model CPU Inference Latency: {:.3f} ms\n".format(model_cpu_inference_latency[0]),
    sep="\n",
)

print(
    "Number of parameters in the modified model: {:,}".format(mod_model.total_params),
    "Number of MAC operations in the modified model: {:,}".format(mod_model.total_mult_adds),
    "Modified model GPU Inference Latency: {:.3f} ms".format(model_quant_gpu_inference_latency[0]),
    "Modified model CPU Inference Latency: {:.3f} ms\n".format(model_quant_cpu_inference_latency[0]),
    sep="\n",
)

# ---- Small model: the header is printed before the measurements are taken ----
print(
    "SMALL MODEL",
    f"input_shape: {model_S.input_shape}, num_layers: {model_S.num_layers}, alpha: {model_S.alpha}, beta: {model_S.beta}, t_zero: {model_S.t_zero}\n",
    sep="\n",
)

# The latency variables are reused, replacing the large-model results.
model_gpu_inference_latency, model_quant_gpu_inference_latency = (
    measure_inference_latency(net, device="cuda", input_shape=(3, 224, 224))
    for net in (model_S, model_quant_S)
)
model_cpu_inference_latency, model_quant_cpu_inference_latency = (
    measure_inference_latency(net, device="cpu", input_shape=(3, 224, 224))
    for net in (model_S, model_quant_S)
)

print(
    "Number of parameters in the original model: {:,}".format(org_model_S.total_params),
    "Number of MAC operations in the original model: {:,}".format(org_model_S.total_mult_adds),
    "Original model GPU Inference Latency: {:.3f} ms".format(model_gpu_inference_latency[0]),
    "Original model CPU Inference Latency: {:.3f} ms\n".format(model_cpu_inference_latency[0]),
    sep="\n",
)

print(
    "Number of parameters in the modified model: {:,}".format(mod_model_S.total_params),
    "Number of MAC operations in the modified model: {:,}".format(mod_model_S.total_mult_adds),
    "Modified model GPU Inference Latency: {:.3f} ms".format(model_quant_gpu_inference_latency[0]),
    "Modified model CPU Inference Latency: {:.3f} ms".format(model_quant_cpu_inference_latency[0]),
    sep="\n",
)


# Model performance

Compare the performance of the original model and the modified model by changing:
- Number of layers;
- $\alpha$;
- $\beta$;
- t<sub>0</sub>;

<img src="image.png" alt="Alt text" width=80% height=80%>


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot(opt_values_x, opt_values_y, org_values_x, org_values_y, opt_label, org_label, x_label, y_label, title):
    """
    Draw a two-series scatter plot on a new 10x6 figure and show it.

    Args:
        opt_values_x: X coordinates of the first series (drawn in orange).
        opt_values_y: Y coordinates of the first series.
        org_values_x: X coordinates of the second series (drawn in blue).
        org_values_y: Y coordinates of the second series.
        opt_label: Legend entry for the first series.
        org_label: Legend entry for the second series.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        title: Title of the figure.
    """
    plt.figure(figsize=(10, 6))

    # Series are drawn in this order: first series, then second series.
    series = (
        (opt_values_x, opt_values_y, 'orange', opt_label),
        (org_values_x, org_values_y, 'blue', org_label),
    )
    for xs, ys, colour, legend_name in series:
        plt.scatter(xs, ys, color=colour, label=legend_name)
    plt.legend()

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid()
    plt.show()

def get_inference_data(num_layers = 7, alpha = 3, beta = 0.75, t_zero = 6, divisor = 8):
    """
    Profile the modified and the original PhiNet built with the same hyper-parameters.

    Both models use ``input_shape=[3, 224, 224]``, ``include_top=True`` and
    ``num_classes=10``. Each is summarized with ``torchinfo.summary`` and timed
    on the GPU first, then on the CPU.

    Args:
        num_layers: ``num_layers`` argument passed to both PhiNet constructors.
        alpha: ``alpha`` argument passed to both PhiNet constructors.
        beta: ``beta`` argument passed to both PhiNet constructors.
        t_zero: ``t_zero`` argument passed to both PhiNet constructors.
        divisor: ``divisor`` argument passed to the modified PhiNet only.
            Defaults to 8, the value used for the large/small model
            comparison elsewhere in this notebook.

    Returns:
        tuple: ``(opt_gpu_ms, opt_cpu_ms, opt_num_params, opt_num_macs,
        gpu_ms, cpu_ms, num_params, num_macs)`` where the ``opt_`` values
        belong to the modified model and the rest to the original model.
        Latencies are mean values in milliseconds.
    """
    shared_kwargs = dict(
        num_layers=num_layers,
        alpha=alpha,
        beta=beta,
        t_zero=t_zero,
        include_top=True,
        num_classes=10,
    )

    def _profile(net):
        """Return (GPU mean ms, CPU mean ms, parameter count, MAC count) for one model."""
        stats = summary(net, input_size=(1, 3, 224, 224), verbose=0)
        # [0] keeps the mean latency and drops the standard deviation.
        gpu_ms = measure_inference_latency(net, device="cuda", input_shape=(3, 224, 224))[0]
        cpu_ms = measure_inference_latency(net, device="cpu", input_shape=(3, 224, 224))[0]
        return gpu_ms, cpu_ms, stats.total_params, stats.total_mult_adds

    # Modified model: pass `divisor` explicitly so the sweep profiles the same
    # variant as the large/small comparison (which uses divisor=8).
    # NOTE(review): confirm phinet_quant.PhiNet's own default for `divisor`.
    opt_stats = _profile(
        phinet_quant.PhiNet(input_shape=[3, 224, 224], divisor=divisor, **shared_kwargs)
    )

    # Original (non-optimized) model
    org_stats = _profile(
        micromind.PhiNet(input_shape=[3, 224, 224], **shared_kwargs)
    )

    return (*opt_stats, *org_stats)


## Number of Layers

In [None]:
# Sweep the number of layers from 3 to 12 (inclusive), other hyper-parameters at their defaults
nr_layers = list(range(3, 13))

# One 8-tuple per configuration, transposed into eight per-metric tuples
_layer_sweep = [get_inference_data(num_layers = layer) for layer in nr_layers]
(
    opt_inference_time_list_GPU,
    opt_inference_time_list_CPU,
    opt_num_params_list,
    opt_num_macs_list,
    inference_time_list_GPU,
    inference_time_list_CPU,
    num_params_list,
    num_macs_list,
) = zip(*_layer_sweep)

# (optimized x, optimized y, original x, original y, x label, y label, title),
# in drawing order: GPU/CPU vs parameters, then GPU/CPU vs MACs
_layer_plots = (
    (opt_num_params_list, opt_inference_time_list_GPU, num_params_list, inference_time_list_GPU,
     "Number of Parameters", "GPU Inference Time (ms)", "GPU Inference Time vs Number of Parameters"),
    (opt_num_params_list, opt_inference_time_list_CPU, num_params_list, inference_time_list_CPU,
     "Number of Parameters", "CPU Inference Time (ms)", "CPU Inference Time vs Number of Parameters"),
    (opt_num_macs_list, opt_inference_time_list_GPU, num_macs_list, inference_time_list_GPU,
     "Number of MACs", "GPU Inference Time (ms)", "GPU Inference Time vs Number of MACs"),
    (opt_num_macs_list, opt_inference_time_list_CPU, num_macs_list, inference_time_list_CPU,
     "Number of MACs", "CPU Inference Time (ms)", "CPU Inference Time vs Number of MACs"),
)
for _opt_x, _opt_y, _org_x, _org_y, _x_label, _y_label, _title in _layer_plots:
    plot(_opt_x, _opt_y, _org_x, _org_y, "Optimized PhiNet", "Original PhiNet", _x_label, _y_label, _title)


## Alpha

In [None]:
# Sweep alpha over 15 evenly spaced values in [0.25, 10], other hyper-parameters at their defaults
alphas = np.linspace(0.25, 10, 15)

# One 8-tuple per configuration, transposed into eight per-metric tuples
_alpha_sweep = [get_inference_data(alpha = alpha) for alpha in alphas]
(
    opt_inference_time_list_GPU,
    opt_inference_time_list_CPU,
    opt_num_params_list,
    opt_num_macs_list,
    inference_time_list_GPU,
    inference_time_list_CPU,
    num_params_list,
    num_macs_list,
) = zip(*_alpha_sweep)

# (optimized x, optimized y, original x, original y, x label, y label, title),
# in drawing order: GPU/CPU vs parameters, then GPU/CPU vs MACs
_alpha_plots = (
    (opt_num_params_list, opt_inference_time_list_GPU, num_params_list, inference_time_list_GPU,
     "Number of Parameters", "GPU Inference Time (ms)", "GPU Inference Time vs Number of Parameters"),
    (opt_num_params_list, opt_inference_time_list_CPU, num_params_list, inference_time_list_CPU,
     "Number of Parameters", "CPU Inference Time (ms)", "CPU Inference Time vs Number of Parameters"),
    (opt_num_macs_list, opt_inference_time_list_GPU, num_macs_list, inference_time_list_GPU,
     "Number of MACs", "GPU Inference Time (ms)", "GPU Inference Time vs Number of MACs"),
    (opt_num_macs_list, opt_inference_time_list_CPU, num_macs_list, inference_time_list_CPU,
     "Number of MACs", "CPU Inference Time (ms)", "CPU Inference Time vs Number of MACs"),
)
for _opt_x, _opt_y, _org_x, _org_y, _x_label, _y_label, _title in _alpha_plots:
    plot(_opt_x, _opt_y, _org_x, _org_y, "Optimized PhiNet", "Original PhiNet", _x_label, _y_label, _title)


## Beta

In [None]:
# Sweep beta over 15 evenly spaced values in [0.25, 1], other hyper-parameters at their defaults
betas = np.linspace(0.25, 1, 15)

# One 8-tuple per configuration, transposed into eight per-metric tuples
_beta_sweep = [get_inference_data(beta = beta) for beta in betas]
(
    opt_inference_time_list_GPU,
    opt_inference_time_list_CPU,
    opt_num_params_list,
    opt_num_macs_list,
    inference_time_list_GPU,
    inference_time_list_CPU,
    num_params_list,
    num_macs_list,
) = zip(*_beta_sweep)

# (optimized x, optimized y, original x, original y, x label, y label, title),
# in drawing order: GPU/CPU vs parameters, then GPU/CPU vs MACs
_beta_plots = (
    (opt_num_params_list, opt_inference_time_list_GPU, num_params_list, inference_time_list_GPU,
     "Number of Parameters", "GPU Inference Time (ms)", "GPU Inference Time vs Number of Parameters"),
    (opt_num_params_list, opt_inference_time_list_CPU, num_params_list, inference_time_list_CPU,
     "Number of Parameters", "CPU Inference Time (ms)", "CPU Inference Time vs Number of Parameters"),
    (opt_num_macs_list, opt_inference_time_list_GPU, num_macs_list, inference_time_list_GPU,
     "Number of MACs", "GPU Inference Time (ms)", "GPU Inference Time vs Number of MACs"),
    (opt_num_macs_list, opt_inference_time_list_CPU, num_macs_list, inference_time_list_CPU,
     "Number of MACs", "CPU Inference Time (ms)", "CPU Inference Time vs Number of MACs"),
)
for _opt_x, _opt_y, _org_x, _org_y, _x_label, _y_label, _title in _beta_plots:
    plot(_opt_x, _opt_y, _org_x, _org_y, "Optimized PhiNet", "Original PhiNet", _x_label, _y_label, _title)


## t<sub>0</sub>

In [None]:
# Sweep t_zero over 7 evenly spaced values in [1, 8] (not all of them integers),
# other hyper-parameters at their defaults
t_zeros = np.linspace(1, 8, 7)

# One 8-tuple per configuration, transposed into eight per-metric tuples
_t_zero_sweep = [get_inference_data(t_zero= t_zero) for t_zero in t_zeros]
(
    opt_inference_time_list_GPU,
    opt_inference_time_list_CPU,
    opt_num_params_list,
    opt_num_macs_list,
    inference_time_list_GPU,
    inference_time_list_CPU,
    num_params_list,
    num_macs_list,
) = zip(*_t_zero_sweep)

# (optimized x, optimized y, original x, original y, x label, y label, title),
# in drawing order: GPU/CPU vs parameters, then GPU/CPU vs MACs
_t_zero_plots = (
    (opt_num_params_list, opt_inference_time_list_GPU, num_params_list, inference_time_list_GPU,
     "Number of Parameters", "GPU Inference Time (ms)", "GPU Inference Time vs Number of Parameters"),
    (opt_num_params_list, opt_inference_time_list_CPU, num_params_list, inference_time_list_CPU,
     "Number of Parameters", "CPU Inference Time (ms)", "CPU Inference Time vs Number of Parameters"),
    (opt_num_macs_list, opt_inference_time_list_GPU, num_macs_list, inference_time_list_GPU,
     "Number of MACs", "GPU Inference Time (ms)", "GPU Inference Time vs Number of MACs"),
    (opt_num_macs_list, opt_inference_time_list_CPU, num_macs_list, inference_time_list_CPU,
     "Number of MACs", "CPU Inference Time (ms)", "CPU Inference Time vs Number of MACs"),
)
for _opt_x, _opt_y, _org_x, _org_y, _x_label, _y_label, _title in _t_zero_plots:
    plot(_opt_x, _opt_y, _org_x, _org_y, "Optimized PhiNet", "Original PhiNet", _x_label, _y_label, _title)
