Fine-Tuning and Quantization of EfficientNet for Food Type Detection Using Food-101 Dataset

In [28]:
import os, tempfile, torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torch.quantization import quantize_dynamic

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# MODEL --------------------------------------------------------------------
# Download pre-trained model
model = models.mobilenet_v3_large(weights=models.MobileNet_V3_Large_Weights.DEFAULT)

#model.classifier[1] = nn.Linear(model.last_channel, 10)

# Modify input layer to accept 1 channel instead of 3
model.features[0][0] = nn.Conv2d(
    in_channels=1,
    out_channels=model.features[0][0].out_channels,
    kernel_size=model.features[0][0].kernel_size,
    stride=model.features[0][0].stride,
    padding=model.features[0][0].padding,
    bias=False
)

# Modify output layer to output 10 classes
in_features = model.classifier[3].in_features
model.classifier[3] = nn.Linear(in_features, 10)

model.to(device)
torch.save(model.state_dict(), "pre-trainedmodel-without-finetuning.pth")

# DATA for Fashion ---------------------------------------------------------------------
'''
trf = transforms.Compose([
    transforms.Resize(224),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize((0.2860,), (0.3530,))
])


train_set = datasets.FashionMNIST(root=".", train=True, download=True, transform=trf)
test_set  = datasets.FashionMNIST(root=".", train=False, download=True, transform=trf)

train_loader = DataLoader(train_set, batch_size=64, shuffle=True,  num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_set,  batch_size=256,shuffle=False, num_workers=4, pin_memory=True)

'''

from torchvision.datasets import FashionMNIST
from torch.utils.data import DataLoader

transform = transforms.Compose([
    transforms.Resize(224),       # MobileNetV3 expects 224x224
    transforms.ToTensor(),        # Keeps 1 channel
    transforms.Normalize((0.5,), (0.5,))  # Normalize 1 channel
])


train_dataset = FashionMNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = FashionMNIST(root='./data', train=False, transform=transform, download=True)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)


# EVALUATION ---------------------------------------------------------------
@torch.no_grad()
def accuracy(net):
    net.eval()
    hits = total = 0
    for X, y in test_loader:
        X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
        hits  += (net(X).argmax(1) == y).sum().item()
        total += y.size(0)
    return hits / total * 100

# measuring accuracy of pretrained model without fine tuning on test dataset for chosen application of cloth type detector
fp32_acc_no_fine = accuracy(model)
print(f"Test Accuracy of pre-trained model without any fine tuning: {fp32_acc_no_fine:5.2f}%")

# FINETUNing the pre-trained model with selected training dataset for given application-----------------------------------------------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

EPOCHS = 1
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for X, y in train_loader:
        X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
        optimizer.zero_grad()
        loss = criterion(model(X), y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS} | loss {running_loss/len(train_loader):.4f}")

torch.save(model.state_dict(), "pre-trained-model-with-finetuning.pth")


fp32_acc = accuracy(model)
print(f"Test accuracy of pre-trained model after fine-tuning and before quantization : {fp32_acc:5.2f}%")

def model_size_mb(net, fname):
    torch.save(net.state_dict(), fname)
    return os.path.getsize(fname) / 1_000_000

tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pt")
fp32_size = model_size_mb(model, tmp.name)
print(f"Model size before qunatization: {fp32_size:5.2f} MB")
tmp.close()

# QUANTISATION -------------------------------------------------------------
quantised = quantize_dynamic(
    model.cpu(),
    {nn.Linear, nn.Conv2d},
    dtype=torch.qint8
).to(device)
quantised.eval()


def accuracy1(net):
    net.eval()
    hits, total = 0, 0
    with torch.no_grad():
        for X, y in test_loader:
            X, y = X.to('cpu'), y.to('cpu')
            hits += (net(X).argmax(1) == y).sum().item()
            total += y.size(0)
    return hits / total * 100

torch.save(model.state_dict(), "quantized-pre-trainedmodel-with-finetuning.pth")

int8_acc = accuracy1(quantised.to('cpu'))
print(f"Test accuracy of pre-trained model after fine-tuning and after quantization : {int8_acc:5.2f}%")

tmp_q = tempfile.NamedTemporaryFile(delete=False, suffix=".pt")
int8_size = model_size_mb(quantised.cpu(), tmp_q.name)
print(f"Model size after INT8 qunatization: {int8_size:5.2f} MB")
tmp_q.close()

print(f"\nMemory saving : {(1 - int8_size / fp32_size) * 100:4.1f}%")
print(f"Accuracy drop : {fp32_acc - int8_acc:4.2f} percentage points")


Downloading: "https://download.pytorch.org/models/mobilenet_v3_large-5c1a4163.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_large-5c1a4163.pth
100%|██████████| 21.1M/21.1M [00:00<00:00, 81.5MB/s]
100%|██████████| 26.4M/26.4M [00:02<00:00, 11.6MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 167kB/s]
100%|██████████| 4.42M/4.42M [00:01<00:00, 3.08MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 13.4MB/s]


Test Accuracy of pre-trained model without any fine tuning: 10.25%
Epoch 1/1 | loss 0.3590
Test accuracy of pre-trained model after fine-tuning and before quantization : 90.17%
Model size before qunatization: 17.06 MB
Test accuracy of pre-trained model after fine-tuning and after quantization : 90.16%
Model size after INT8 qunatization: 13.34 MB

Memory saving : 21.8%
Accuracy drop : 0.01 percentage points


## Food Type Detection Using EfficientNet_B0 on Food101

Step 1: Load Pretrained Model-
We load the EfficientNet model (e.g., efficientnet_b0) which has been pre-trained on ImageNet. This helps the model start with strong visual features without training from scratch.

In [30]:

# Step 1: Import Libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import Food101
from torchvision.models import efficientnet_b0
from torch.utils.data import DataLoader, random_split
import time
import os
import copy
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


Step 2: Load and Prepare Dataset-
We use the Food101 dataset, a large vision dataset of 101 food categories. Images are transformed (resized, normalized, converted to tensor) and split into training and test loaders for training and evaluation.

In [33]:

# Step 2: Load and Transform Food101 Dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

food_train = Food101(root='./data', split='train', transform=transform, download=True)
food_test = Food101(root='./data', split='test', transform=transform, download=True)

# Subsample for faster training/testing
train_subset, _ = random_split(food_train, [5000, len(food_train) - 5000])
test_subset, _ = random_split(food_test, [1000, len(food_test) - 1000])

train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_subset, batch_size=32, shuffle=False)


Step 3: Replace the Classifier-
We replace the final classification layer of the pretrained EfficientNet model to match our dataset (101 classes for Food101 instead of 1000 ).

In [35]:

# Step 3: Load Pretrained EfficientNet_B0 and Modify Output Layer
model = efficientnet_b0(pretrained=True)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, 101)
model = model.to(device)


Step 4: Evaluate Pretrained Model (Before Fine-Tuning)
We evaluate the model on the test set before any fine-tuning. This gives us a baseline accuracy using the pretrained features.

In [16]:
# Step 4: Test Accuracy Before Fine-Tuning
def evaluate(model, dataloader, device):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return 100 * correct / total


baseline_acc = evaluate(model, test_loader, device)
print(f"Test Accuracy (before fine-tuning): {baseline_acc:.2f}%")



Test Accuracy (before fine-tuning): 1.50%


Step 5: Fine-Tune the Model-
We train the model for one epoch on the Food101 training set. This helps the model adapt its learned features to the specific task of food classification.

In [36]:
# Step 5: Fine-Tune for 1 Epoch
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

model.train()
for images, labels in train_loader:
    images, labels = images.to(device), labels.to(device)
    optimizer.zero_grad()
    outputs = model(images)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()


finetuned_acc = evaluate(model, test_loader, device)
print(f"Test Accuracy (after fine-tuning): {finetuned_acc:.2f}%")



Test Accuracy (after fine-tuning): 24.60%


Step 6: Evaluate Model (After Fine-Tuning)-
After fine-tuning, we evaluate the model again on the test set to see how much performance improved compared to the pretrained baseline.

In [37]:

# Step 6: Save Model and Measure Size
torch.save(model.state_dict(), "efficientnet_finetuned.pth")
original_size = os.path.getsize("efficientnet_finetuned.pth") / 1e6  # MB
print(f"Model size before quantization: {original_size:.2f} MB")


Model size before quantization: 16.85 MB


Step 7: Measure Model Size Before Quantization-
We save the model to disk and measure its file size. This gives the size of the full-precision (FP32) model before applying any compression.

In [40]:
# Step 7: Quantize Model
quantized_model = torch.quantization.quantize_dynamic(copy.deepcopy(model), {nn.Linear}, dtype=torch.qint8)
torch.save(quantized_model.state_dict(), "efficientnet_quantized.pth")
quantized_size = os.path.getsize("efficientnet_quantized.pth") / 1e6  # MB
print(f"Model size after quantization: {quantized_size:.2f} MB")



Model size after quantization: 16.46 MB


Step 8: Apply Dynamic Quantization-
We use dynamic quantization to convert Linear layers of the model to 8-bit integers (INT8), which reduces model size and may speed up inference — especially on CPU.

In [41]:
# Step 8: Apply Dynamic Quantization
from torch.quantization import quantize_dynamic

quantized_model = quantize_dynamic(
    model.cpu(),                 # move model to CPU before quantization
    {torch.nn.Linear},           # specify layers to quantize
    dtype=torch.qint8            # use int8 for weights
)

quantized_model.eval()          # evaluation mode
quantized_model.to("cpu")       # make sure it's on CPU

quantized_acc = evaluate(quantized_model, test_loader, device="cpu")
print(f"Test Accuracy (after quantizing linear layers only): {quantized_acc:.2f}%")


Test Accuracy (after quantizing linear layers only): 24.80%


Step 9: Measure Inference Latency-
We measure and compare inference time (latency) before and after quantization. This step helps show the speed improvement that quantization can provide, especially on CPU.

In [42]:
# Step 9: Measure Inference Latency

import time
import numpy as np

# Step 9: Inference Latency Comparison
def measure_latency(model, loader, device='cpu', n_samples=10):
    model.to(device)
    model.eval()
    times = []
    count = 0
    with torch.no_grad():
        for images, _ in loader:
            images = images.to(device)
            start = time.time()
            _ = model(images)
            end = time.time()
            times.append((end - start) * 1000)  # milliseconds
            count += 1
            if count >= n_samples:
                break
    return np.mean(times)

# Measure latency
latency_before = measure_latency(model, test_loader, device='cuda')  # original model on GPU
latency_after = measure_latency(quantized_model, test_loader, device='cpu')  # quantized model on CPU

print(f"Inference latency before quantization (GPU): {latency_before:.2f} ms")
print(f"Inference latency after quantization (CPU): {latency_after:.2f} ms")


Inference latency before quantization (GPU): 14.47 ms
Inference latency after quantization (CPU): 2020.89 ms


Step 10: Report Summary Metrics-
We calculate and print:

Memory saving (%) after quantization

Accuracy drop caused by quantization

Test accuracies before and after fine-tuning and quantization

Inference latency (in milliseconds) before and after quantization

In [44]:
# Step 10: Report Summary Metrics We calculate and print:

import time
import numpy as np

def measure_latency(model, loader, device, n_samples=10):
    model.eval()
    model.to(device)
    times = []
    images_shown = 0

    with torch.no_grad():
        for images, _ in loader:
            images = images.to(device)
            start = time.time()
            _ = model(images)
            end = time.time()
            times.append((end - start) * 1000)  # convert to ms
            images_shown += 1
            if images_shown >= n_samples:
                break

    return np.mean(times)

# Run latency for original (GPU) and quantized (CPU) models
latency_before = measure_latency(model, test_loader, device='cuda')        # Original model on GPU
latency_after = measure_latency(quantized_model, test_loader, device='cpu')  # Quantized model on CPU

print(f"Inference latency before quantization (GPU): {latency_before:.2f} ms")
print(f"Inference latency after quantization (CPU): {latency_after:.2f} ms")


Inference latency before quantization (GPU): 10.12 ms
Inference latency after quantization (CPU): 1960.31 ms


In [45]:
import os
import torch

def get_model_size(model, filename="temp.pth"):
    torch.save(model.state_dict(), filename)
    size_mb = os.path.getsize(filename) / 1e6  # MB
    os.remove(filename)
    return size_mb

# Measure size of both models
original_size = get_model_size(model, "original.pth")
quantized_size = get_model_size(quantized_model, "quantized.pth")

# Accuracy metrics (already evaluated earlier)
# finetuned_acc: accuracy after fine-tuning
# quantized_acc: accuracy after quantization

# Memory savings and accuracy drop
memory_saved = ((original_size - quantized_size) / original_size) * 100
accuracy_drop = finetuned_acc - quantized_acc

print(f"Original Model Size: {original_size:.2f} MB")
print(f"Quantized Model Size: {quantized_size:.2f} MB")
print(f"Memory Saving after Quantization: {memory_saved:.2f}%")
print(f"Accuracy Drop after Quantization: {accuracy_drop:.2f}%")


Original Model Size: 16.85 MB
Quantized Model Size: 16.45 MB
Memory Saving after Quantization: 2.33%
Accuracy Drop after Quantization: -0.20%


# Final Summary
| Metric                                 | Value                        |
| -------------------------------------- | ---------------------------- |
| **Device Used**                        | CUDA (T4 GPU)                |
| **Test Accuracy (Before Fine-Tuning)** | 1.50%                        |
| **Test Accuracy (After Fine-Tuning)**  | 24.60%                       |
| **Model Size (Before Quantization)**   | 16.85 MB                     |
| **Model Size (After Quantization)**    | 16.46 MB                     |
| **Memory Saved After Quantization**    | 2.33%                        |
| **Test Accuracy (After Quantization)** | 24.80%                       |
| **Accuracy Drop After Quantization**   | **-0.20%** (slight increase) |
| **Inference Latency (Before, GPU)**    | 10.12 ms                     |
| **Inference Latency (After, CPU)**     | 1960.31 ms                   |


Conclusion
The EfficientNet model was successfully fine-tuned on the Food101 dataset, achieving a 24.60% test accuracy after one epoch of fine-tuning. While this accuracy is modest, it demonstrates that the model is starting to learn useful features specific to the food classification task.

After applying dynamic quantization to linear layers, the model size was reduced from 16.85 MB to 16.46 MB, yielding a 2.33% memory saving. This is a minor reduction since only the linear layers were quantized.

Interestingly, quantization led to a slight increase in accuracy (+0.20%), possibly due to regularization effects.

However, inference latency significantly increased from ~10 ms (GPU) to ~1960 ms (CPU) post-quantization. This is expected because dynamic quantization is designed to optimize CPU inference, and your original model was running on GPU (which is significantly faster for this model type).

This highlights an important trade-off: while quantization can save memory and enable model deployment on lower-end hardware (e.g., mobile devices), it may lead to slower inference unless further optimized for CPU or edge environments.

