In [2]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from fedlern.models.mlp import MLP
from fedlern.models.resnet_v2 import ResNet18
from fedlern.train_utils import *
from fedlern.quant_utils import *
import fedlern.utils as utils
import torch.nn.functional as F

In [3]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cuda_device = torch.device("cuda:0")
cpu_device = torch.device("cpu:0")

#stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
stats = (0.49139968, 0.48215841, 0.44653091), (0.24703223, 0.24348513, 0.26158784)
batch_size = 100

In [4]:
train_loader, test_loader = prepare_dataloader_cifar(num_workers=8, train_batch_size=batch_size, eval_batch_size=batch_size, stats=stats)


Files already downloaded and verified


# RESNET no quantization

In [5]:
# Test the model

model = ResNet18()
load_model(model, './saved_models/resnet18_cifar10_92-5.pt', device)

loss, acc = evaluate_model(model, test_loader, device,)
print(f'Loss: {loss}, Accuracy: {acc*100}%')
print_model_size(model)


fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
fp32_gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=(1,3,32,32), num_samples=100)
print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("FP32 CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))




Loss: 0.0, Accuracy: 92.49999237060547%
44.77 MB
FP32 CPU Inference Latency: 6.57 ms / sample
FP32 CUDA Inference Latency: 4.21 ms / sample


In [6]:
# Load the model
model = ResNet18()
load_model(model, './saved_models/resnet18_cifar10_92-2.pt', device)

# Test the model
loss, acc = evaluate_model(model, test_loader, device,)
print(f'Loss: {loss}, Accuracy: {acc*100}%')
print_model_size(model)

# Measure inference latency
fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
fp32_gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=(1,3,32,32), num_samples=100)
print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("FP32 CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))


Loss: 0.0, Accuracy: 92.24999237060547%
44.77 MB
FP32 CPU Inference Latency: 9.38 ms / sample
FP32 CUDA Inference Latency: 3.97 ms / sample


In [7]:
# Load the model
model = ResNet18()
load_model(model, './saved_models/resnet18v2_cifar10.pt', device)

# Test the model
loss, acc = evaluate_model(model, test_loader, device,)
print(f'Loss: {loss}, Accuracy: {acc*100}%')
print_model_size(model)

# Measure inference latency
fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
fp32_gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=(1,3,32,32), num_samples=100)
print("CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))


Loss: 0.0, Accuracy: 92.67999267578125%
44.77 MB
CPU Inference Latency: 9.94 ms / sample
CUDA Inference Latency: 4.32 ms / sample


# RESNET Quantization 4 bits

In [8]:
# Load the model
model = ResNet18()
load_model(model, './saved_models/resnet_4bits_2023-06-17_19-33.pt', device)

# Test the model
loss, acc = evaluate_model(model, test_loader, device,)
print(f'Loss: {loss}, Accuracy: {acc*100}%')
print_model_size(model)

# Measure inference latency
cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=(1,3,32,32), num_samples=100)
print("CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))

Loss: 0.0, Accuracy: 79.60999298095703%
44.77 MB
CPU Inference Latency: 9.94 ms / sample
CUDA Inference Latency: 4.32 ms / sample


# Quantization 8 bits

In [9]:
# Load the model
model = ResNet18()
load_model(model, './saved_models/resnet_8bits_2023-06-17_16-05.pt', device)

# Test the model
loss, acc = evaluate_model(model, test_loader, device,)
print(f'Loss: {loss}, Accuracy: {acc*100}%')
print_model_size(model)

# Measure inference latency
cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=(1,3,32,32), num_samples=100)
print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("FP32 CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))

Loss: 0.0, Accuracy: 83.66000366210938%
44.77 MB
FP32 CPU Inference Latency: 9.94 ms / sample
FP32 CUDA Inference Latency: 4.32 ms / sample


In [10]:
param = [ 28 * 28, # input
                512, 256, 128, 64,
                10 ] #output

transform = transforms.Compose([
    transforms.ToTensor(), # convert the image to a PyTorch tensor
    transforms.Normalize((0.5,), (0.5,)) # normalize the image with mean=0.5 and std=0.5
])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_dataset = datasets.MNIST(root='data/', train=False, transform=transform, download=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=100, shuffle=False)



In [11]:
# Loading pretrained model
modeldict = torch.load('saved_models/mlp.ckpt')
model = MLP(param)
model.load_state_dict(modeldict)

q_dict = torch.load('saved_models/mlp_dynamicq.ckpt')
print(q_dict.keys())
model_qd = MLP(param)
model_qd.load_state_dict(q_dict)

loss, acc = utils.evaluate_model(model, test_loader, device)
print(f'Loss: {loss}, Accuracy: {acc*100}%')

# model.to(device)
# quantized_model.eval()
# model.eval()
# with torch.no_grad():
#     correctq = 0
#     totalq = 0
#     total = 0
#     correct = 0
#     for images, labels in test_loader:
#         images_cuda = images.to(device)
#         labels_cuda = labels.to(device)

#         outputsq = quantized_model(images)
#         _, predictedq = torch.max(outputsq.data, 1)
#         totalq += labels.size(0)
#         correctq += (predictedq == labels).sum().item()
        
#         outputs = model(images_cuda)
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels_cuda.size(0)
#         correct += (predicted == labels_cuda).sum().item()
        

#     print('Accuracy of the quantized model on the test images: {} %'.format(100 * correctq / totalq))
#     print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))




utils.print_model_size(model)
utils.print_model_size(model_qd)

odict_keys(['linears.0.scale', 'linears.0.zero_point', 'linears.0._packed_params.dtype', 'linears.0._packed_params._packed_params', 'linears.1.scale', 'linears.1.zero_point', 'linears.1._packed_params.dtype', 'linears.1._packed_params._packed_params', 'linears.2.scale', 'linears.2.zero_point', 'linears.2._packed_params.dtype', 'linears.2._packed_params._packed_params', 'linears.3.scale', 'linears.3.zero_point', 'linears.3._packed_params.dtype', 'linears.3._packed_params._packed_params', 'linears.4.scale', 'linears.4.zero_point', 'linears.4._packed_params.dtype', 'linears.4._packed_params._packed_params'])


  device=storage.device,


RuntimeError: Error(s) in loading state_dict for MLP:
	Missing key(s) in state_dict: "linears.0.weight", "linears.0.bias", "linears.1.weight", "linears.1.bias", "linears.2.weight", "linears.2.bias", "linears.3.weight", "linears.3.bias", "linears.4.weight", "linears.4.bias". 
	Unexpected key(s) in state_dict: "linears.0.scale", "linears.0.zero_point", "linears.0._packed_params.dtype", "linears.0._packed_params._packed_params", "linears.1.scale", "linears.1.zero_point", "linears.1._packed_params.dtype", "linears.1._packed_params._packed_params", "linears.2.scale", "linears.2.zero_point", "linears.2._packed_params.dtype", "linears.2._packed_params._packed_params", "linears.3.scale", "linears.3.zero_point", "linears.3._packed_params.dtype", "linears.3._packed_params._packed_params", "linears.4.scale", "linears.4.zero_point", "linears.4._packed_params.dtype", "linears.4._packed_params._packed_params". 