<a href="https://colab.research.google.com/github/TensorCruncher/food-image-classifier/blob/main/food_classifier_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Introduction

* In this notebook 📘 we will fine-tune an EffNetB2 neural network, pretrained on the ImageNet
dataset, on the Food101 🍔 🌮 🍕 dataset to create a food image classifier.

* But first, we will fine-tune and test three different architectures (AlexNet, EffNetB2 and ViT-B/16) on a smaller subset of Food101 (3 classes, 200 images/class).

* Then, we create and train an EffNetB2 instance on a larger subset of Food101 (101 classes, 200 images/class).

* We then deploy this model on Hugging Face Spaces via Gradio.

# 2. Setup

In [None]:
import torch
from torch import nn

import torchvision
from torchvision import transforms
from torchvision import datasets

!pip install -q torchinfo
from torchinfo import summary

import pandas as pd
import matplotlib.pyplot as plt

import os
from pathlib import Path

from google.colab import files


In [None]:
# Fetch the project's helper package: clone the repository, keep only its
# `utils/` directory in the working directory, then delete the rest of the clone.
!git clone https://github.com/TensorCruncher/food-image-classifier
!mv food-image-classifier/utils .
!rm -rf food-image-classifier

In [None]:
from utils import data
from utils import engine
from utils import misc
from utils import model
from utils import predict

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
device

# 3. Downloading the Data

In [None]:
# Fetch the 3-class (pizza / samosa / tacos) archive through the project's
# download helper; the helper's return value is used as a path in later cells.
data_20_percent_path = data.download_data(
    source=(
        "https://github.com/TensorCruncher/food-image-classifier/raw/refs/heads/main/"
        "data/pizza_samosa_tacos_20_percent.zip"
    ),
    destination="pizza_samosa_tacos_20_percent",
)

In [None]:
data_20_percent_path

In [None]:
train_dir = data_20_percent_path / "train"
test_dir = data_20_percent_path / "test"

# 4. AlexNet

In [None]:
alexnet, alexnet_transforms = model.create_alexnet_model(num_classes=3,
                                                         seed=42)

In [None]:
summary(alexnet,
        input_size=(1, 3, 224, 224),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

In [None]:
train_dataloader_alexnet, test_dataloader_alexnet, class_names = data.create_dataloaders(train_dir=train_dir,
                                                                                         test_dir=test_dir,
                                                                                         transform=alexnet_transforms,
                                                                                         batch_size=32)

In [None]:
writer = engine.create_writer(experiment_name="data_20_percent",
                              model_name="alexnet",
                              extra="10_epochs")

In [None]:
# Adam over everything alexnet.parameters() yields, scored with plain cross-entropy.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(alexnet.parameters(), lr=1e-3)

# Call the project's seed helper immediately before training starts.
misc.set_seeds()
alexnet_results = engine.train(
    model=alexnet,
    train_dataloader=train_dataloader_alexnet,
    test_dataloader=test_dataloader_alexnet,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10,
    device=device,
    writer=writer,
)

In [None]:
%load_ext tensorboard
%tensorboard --logdir runs

In [None]:
misc.save_model(model=alexnet,
                target_dir="models",
                model_name="pretrained_alexnet_feature_extractor_pizza_samosa_tacos_20_percent.pth")

In [None]:
pretrained_alexnet_model_size = Path("models/pretrained_alexnet_feature_extractor_pizza_samosa_tacos_20_percent.pth").stat().st_size // (1024*1024)
print(f"Pretrained Alexnet feature extractor model size: {pretrained_alexnet_model_size} MB")

In [None]:
alexnet_total_params = sum(torch.numel(param) for param in alexnet.parameters())
alexnet_total_params

In [None]:
alexnet_stats = {"test_loss": alexnet_results["test_loss"][-1],
                  "test_acc": alexnet_results["test_acc"][-1],
                  "number_of_parameters": alexnet_total_params,
                  "model_size (MB)": pretrained_alexnet_model_size}
alexnet_stats

In [None]:
files.download("models/pretrained_alexnet_feature_extractor_pizza_samosa_tacos_20_percent.pth")

# 5. EffNetB2

In [None]:
effnetb2, effnetb2_transforms = model.create_effnetb2_model(num_classes=3,
                                                            seed=42)

In [None]:
summary(effnetb2,
        input_size=(1, 3, 224, 224),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

In [None]:
train_dataloader_effnetb2, test_dataloader_effnetb2, class_names = data.create_dataloaders(train_dir=train_dir,
                                                                                           test_dir=test_dir,
                                                                                           transform=effnetb2_transforms,
                                                                                           batch_size=32)

In [None]:
writer = engine.create_writer(experiment_name="data_20_percent",
                              model_name="effnetb2",
                              extra="10_epochs")

In [None]:
# Adam over everything effnetb2.parameters() yields, scored with plain cross-entropy.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(effnetb2.parameters(), lr=1e-3)

# Call the project's seed helper immediately before training starts.
misc.set_seeds()
effnetb2_results = engine.train(
    model=effnetb2,
    train_dataloader=train_dataloader_effnetb2,
    test_dataloader=test_dataloader_effnetb2,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10,
    device=device,
    writer=writer,
)

In [None]:
%tensorboard --logdir runs

In [None]:
misc.save_model(model=effnetb2,
                target_dir="models",
                model_name="pretrained_effnetb2_feature_extractor_pizza_samosa_tacos_20_percent.pth")

In [None]:
pretrained_effnetb2_model_size = Path("models/pretrained_effnetb2_feature_extractor_pizza_samosa_tacos_20_percent.pth").stat().st_size // (1024*1024)
print(f"Pretrained EffNetB2 feature extractor model size: {pretrained_effnetb2_model_size} MB")

In [None]:
effnetb2_total_params = sum(torch.numel(param) for param in effnetb2.parameters())
effnetb2_total_params

In [None]:
effnetb2_stats = {"test_loss": effnetb2_results["test_loss"][-1],
                  "test_acc": effnetb2_results["test_acc"][-1],
                  "number_of_parameters": effnetb2_total_params,
                  "model_size (MB)": pretrained_effnetb2_model_size}
effnetb2_stats

In [None]:
files.download("models/pretrained_effnetb2_feature_extractor_pizza_samosa_tacos_20_percent.pth")

# 6. ViT-B/16

In [None]:
vit, vit_transforms = model.create_vit_model(num_classes=3,
                                             seed=42)

In [None]:
summary(vit,
        input_size=(1, 3, 224, 224),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

In [None]:
train_dataloader_vit, test_dataloader_vit, class_names = data.create_dataloaders(train_dir=train_dir,
                                                                                 test_dir=test_dir,
                                                                                 transform=vit_transforms,
                                                                                 batch_size=32)

In [None]:
writer = engine.create_writer(experiment_name="data_20_percent",
                              model_name="vit",
                              extra="10_epochs")

In [None]:
# Adam over everything vit.parameters() yields, scored with plain cross-entropy.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(vit.parameters(), lr=1e-3)

# Call the project's seed helper immediately before training starts.
misc.set_seeds()
vit_results = engine.train(
    model=vit,
    train_dataloader=train_dataloader_vit,
    test_dataloader=test_dataloader_vit,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10,
    device=device,
    writer=writer,
)

In [None]:
%tensorboard --logdir runs

In [None]:
misc.save_model(model=vit,
                target_dir="models",
                model_name="pretrained_vit_feature_extractor_pizza_samosa_tacos_20_percent.pth")

In [None]:
pretrained_vit_model_size = Path("models/pretrained_vit_feature_extractor_pizza_samosa_tacos_20_percent.pth").stat().st_size // (1024*1024)
print(f"Pretrained ViT feature extractor model size: {pretrained_vit_model_size} MB")

In [None]:
vit_total_params = sum(torch.numel(param) for param in vit.parameters())
vit_total_params

In [None]:
vit_stats = {"test_loss": vit_results["test_loss"][-1],
             "test_acc": vit_results["test_acc"][-1],
             "number_of_parameters": vit_total_params,
             "model_size (MB)": pretrained_vit_model_size}
vit_stats

In [None]:
files.download("models/pretrained_vit_feature_extractor_pizza_samosa_tacos_20_percent.pth")

# 7. Calculating the inference time / image for our models

In [None]:
print(f"[INFO] Finding all filepaths ending with '.jpg' in directory: {test_dir}")
test_data_paths = list(Path(test_dir).glob("*/*.jpg"))
test_data_paths[:5]

## 7.1 Timing Alexnet

In [None]:
alexnet_test_pred_dicts = predict.pred_and_store(paths=test_data_paths,
                                         model=alexnet,
                                         transform=alexnet_transforms,
                                         class_names=class_names,
                                         device="cpu")

In [None]:
alexnet_test_pred_dicts[:2]

In [None]:
alexnet_test_pred_df = pd.DataFrame(alexnet_test_pred_dicts)
alexnet_test_pred_df.head()

In [None]:
alexnet_test_pred_df.correct.value_counts()

In [None]:
alexnet_average_time_per_pred = round(alexnet_test_pred_df.time_for_pred.mean(), 4)
print(f"Alexnet average time per prediction: {alexnet_average_time_per_pred} seconds")

In [None]:
alexnet_stats["time_per_pred_cpu"] = float(alexnet_average_time_per_pred)
alexnet_stats

## 7.2 Timing EffNetB2

In [None]:
effnetb2_test_pred_dicts = predict.pred_and_store(paths=test_data_paths,
                                          model=effnetb2,
                                          transform=effnetb2_transforms,
                                          class_names=class_names,
                                          device="cpu")

In [None]:
effnetb2_test_pred_dicts[:2]

In [None]:
effnetb2_test_pred_df = pd.DataFrame(effnetb2_test_pred_dicts)
effnetb2_test_pred_df.head()

In [None]:
effnetb2_test_pred_df.correct.value_counts()

In [None]:
effnetb2_average_time_per_pred = round(effnetb2_test_pred_df.time_for_pred.mean(), 4)
print(f"EffNetB2 average time per prediction: {effnetb2_average_time_per_pred} seconds")

In [None]:
effnetb2_stats["time_per_pred_cpu"] = float(effnetb2_average_time_per_pred)
effnetb2_stats

## 7.3 Timing ViT

In [None]:
vit_test_pred_dicts = predict.pred_and_store(paths=test_data_paths,
                                     model=vit,
                                     transform=vit_transforms,
                                     class_names=class_names,
                                     device="cpu")

In [None]:
vit_test_pred_dicts[:2]

In [None]:
vit_test_pred_df = pd.DataFrame(vit_test_pred_dicts)
vit_test_pred_df.head()

In [None]:
vit_test_pred_df.correct.value_counts()

In [None]:
vit_average_time_per_pred = round(vit_test_pred_df.time_for_pred.mean(), 4)
print(f"ViT average time per prediction: {vit_average_time_per_pred} seconds")

In [None]:
vit_stats["time_per_pred_cpu"] = float(vit_average_time_per_pred)
vit_stats

# 8. Comparing model size, performance and inference time

In [None]:
# One row per architecture; the label list below follows the same order.
per_model_stats = [alexnet_stats, effnetb2_stats, vit_stats]
model_labels = ["AlexNet", "EffNetB2", "ViT"]

df = pd.DataFrame(per_model_stats).assign(model=model_labels)

# Scale test_acc by 100 and keep two decimals (it is plotted as a percentage below).
df["test_acc"] = (df["test_acc"] * 100).round(2)

df

In [None]:
model_df = df.set_index("model")


def _stat_ratio(numerator: str, denominator: str) -> pd.Series:
    """Divide every statistic in one model's row by the same statistic of another model."""
    return model_df.loc[numerator] / model_df.loc[denominator]


vit_to_effnet = _stat_ratio("ViT", "EffNetB2")
vit_to_alexnet = _stat_ratio("ViT", "AlexNet")
effnet_to_alexnet = _stat_ratio("EffNetB2", "AlexNet")

# Each ratio series becomes a column; transpose so each comparison is a row.
ratio_columns = {
    "ViT to EffNetB2": vit_to_effnet,
    "EffNetB2 to AlexNet": effnet_to_alexnet,
    "ViT to AlexNet": vit_to_alexnet,
}
pd.DataFrame(ratio_columns).T

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
# Columns are looked up in `df` by name: x = CPU seconds per prediction,
# y = test accuracy, marker area = checkpoint size. One colour per row.
scatter = ax.scatter(data=df,
                     x="time_per_pred_cpu",
                     y="test_acc",
                     c=["blue", "orange", "green"],
                     s="model_size (MB)")

ax.set_title("Inference Speed vs Performance", fontsize=18)
ax.set_xlabel("Prediction time per image (seconds)", fontsize=14)
ax.set_ylabel("Test accuracy (%)", fontsize=14)
ax.tick_params(axis='both', labelsize=12)
ax.grid(True)

# Label every marker with its model name, nudged slightly up and to the right
# so the text does not sit on top of the marker.
for _, row in df.iterrows():
    ax.annotate(text=row["model"],
                xy=(row["time_per_pred_cpu"]+0.0006, row["test_acc"]+0.03),
                size=12)

# Legend keyed on marker size, i.e. on model size in MB.
handles, labels = scatter.legend_elements(prop="sizes", alpha=0.5)
model_size_legend = ax.legend(handles,
                              labels,
                              loc="lower right",
                              title="Model size (MB)",
                              fontsize=12)

# The original shell line misspelled `mkdir` ("mdkir"), so it failed on every
# run and images/ was never created. Create it in Python, idempotently.
Path("images").mkdir(parents=True, exist_ok=True)
# The figure is still written to the working directory, which is where the
# following files.download(...) cell looks for it.
fig.savefig("model-inference-time-vs-accuracy.jpg")

plt.show()

In [None]:
files.download("model-inference-time-vs-accuracy.jpg")

We can see in the above analysis that EffNetB2 provides slightly higher test accuracy for about one-tenth of the model size compared to AlexNet. This is advantageous despite it taking twice the time per prediction.

While ViT gives the best test accuracy, its model size is also more than 10x the EffNet model size. Further, it takes about 4x the time to make a prediction.

Keeping in mind the above factors, we will use the EffNetB2 architecture going forward to train on a larger subset of the Food101 dataset. It will still have 20% of the data per class, but we will use all 101 classes instead of just the three we have used so far.

# 9. Training the model on a larger subset of Food101

In [None]:
effnetb2_food101, effnetb2_transforms = model.create_effnetb2_model(num_classes=101)

In [None]:
summary(effnetb2_food101,
        input_size=(1, 3, 224, 224),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

In [None]:
# Training pipeline: TrivialAugmentWide augmentation first, followed by the
# preprocessing transforms returned alongside the EffNetB2 model.
food101_train_steps = [
    transforms.TrivialAugmentWide(),
    effnetb2_transforms,
]
food101_train_transforms = transforms.Compose(food101_train_steps)

In [None]:
print(f"Training transforms:\n{food101_train_transforms}\n")
print(f"Testing transforms:\n{effnetb2_transforms}")

In [None]:
data_dir = Path("data")

train_data = datasets.Food101(root=data_dir,
                              split="train",
                              transform=food101_train_transforms,
                              download=True)

test_data = datasets.Food101(root=data_dir,
                             split="test",
                             transform=effnetb2_transforms,
                             download=True)

In [None]:
food101_class_names = train_data.classes

food101_class_names[:10]

In [None]:
train_data_food101_20_percent, _ = data.split_dataset(dataset=train_data,
                                                 split_size=0.2)

test_data_food101_20_percent, _ = data.split_dataset(dataset=test_data,
                                                split_size=0.2)

len(train_data_food101_20_percent), len(test_data_food101_20_percent)

In [None]:
# os.cpu_count() is documented to return None when the core count cannot be
# determined; fall back to 1 so the comparison below cannot raise TypeError.
available_cpus = os.cpu_count() or 1
NUM_WORKERS = 2 if available_cpus <= 4 else 4

# Training batches are reshuffled every epoch.
train_dataloader_food101_20_percent = torch.utils.data.DataLoader(train_data_food101_20_percent,
                                                                  batch_size=32,
                                                                  shuffle=True,
                                                                  num_workers=NUM_WORKERS)

# Test batches keep a fixed order.
test_dataloader_food101_20_percent = torch.utils.data.DataLoader(test_data_food101_20_percent,
                                                                 batch_size=32,
                                                                 shuffle=False,
                                                                 num_workers=NUM_WORKERS)

In [None]:
writer = engine.create_writer(experiment_name="food_101_20_percent",
                              model_name="effnetb2",
                              extra="10_epochs")

In [None]:
# Cross-entropy with label smoothing of 0.1 for the 101-class run.
loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
# Adam over everything effnetb2_food101.parameters() yields.
optimizer = torch.optim.Adam(effnetb2_food101.parameters(), lr=1e-3)

# Call the project's seed helper immediately before training starts.
misc.set_seeds()
effnetb2_food101_results = engine.train(
    model=effnetb2_food101,
    train_dataloader=train_dataloader_food101_20_percent,
    test_dataloader=test_dataloader_food101_20_percent,
    epochs=10,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
    writer=writer,
)

In [None]:
%tensorboard --logdir runs

In [None]:
effnetb2_food101_model_path = "pretrained_effnetb2_feature_extractor_food101_20_percent.pth"

misc.save_model(model=effnetb2_food101,
                target_dir="models",
                model_name=effnetb2_food101_model_path)

In [None]:
effnetb2_food101_total_params = sum(torch.numel(param) for param in effnetb2_food101.parameters())
effnetb2_food101_total_params

In [None]:
pretrained_effnetb2_food101_model_size = Path("models", effnetb2_food101_model_path).stat().st_size // (1024*1024)
print(f"Pretrained EffNetB2 feature extractor Food101 model size: {pretrained_effnetb2_food101_model_size} MB")

In [None]:
effnetb2_food101_stats = {"test_loss": effnetb2_food101_results["test_loss"][-1],
                          "test_acc": effnetb2_food101_results["test_acc"][-1],
                          "number_of_parameters": effnetb2_food101_total_params,
                          "model_size (MB)": pretrained_effnetb2_food101_model_size}
effnetb2_food101_stats

In [None]:
files.download("models/pretrained_effnetb2_feature_extractor_food101_20_percent.pth")

# 10. Creating Gradio Demo

In [None]:
food_classifier_demo_path = Path("demos/food_classifier/")

# Create the demo root and its examples/ sub-directory; both calls are no-ops
# when the directories already exist.
for demo_dir in (food_classifier_demo_path, food_classifier_demo_path / "examples"):
    demo_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Download the four example images from the project repository into the
# working directory.
!wget https://raw.githubusercontent.com/TensorCruncher/food-image-classifier/main/images/pizza.jpg
!wget https://raw.githubusercontent.com/TensorCruncher/food-image-classifier/main/images/samosa.jpg
!wget https://raw.githubusercontent.com/TensorCruncher/food-image-classifier/main/images/tacos.jpg
!wget https://raw.githubusercontent.com/TensorCruncher/food-image-classifier/main/images/red_velvet_cake.jpg

# Move each image into the demo's examples/ directory.
!mv pizza.jpg demos/food_classifier/examples/pizza.jpg
!mv samosa.jpg demos/food_classifier/examples/samosa.jpg
!mv tacos.jpg demos/food_classifier/examples/tacos.jpg
!mv red_velvet_cake.jpg demos/food_classifier/examples/red_velvet_cake.jpg

# Move (not copy) the trained Food101 checkpoint next to the demo code; after
# this line the file no longer exists under models/.
!mv models/pretrained_effnetb2_feature_extractor_food101_20_percent.pth demos/food_classifier

In [None]:
food101_class_names[:10]

In [None]:
food_classifier_class_names_path = food_classifier_demo_path / "class_names.txt"

# Write one class name per line, with no trailing newline after the last one.
with open(food_classifier_class_names_path, "w") as class_names_file:
    print(f"[INFO] Saving Food101 class names to {food_classifier_class_names_path}")
    class_names_text = "\n".join(food101_class_names)
    class_names_file.write(class_names_text)

In [None]:
# Read the file back line by line, stripping surrounding whitespace (including
# the newline) from every entry.
food101_class_names_loaded = []
with open(food_classifier_class_names_path, "r") as class_names_file:
    for raw_line in class_names_file:
        food101_class_names_loaded.append(raw_line.strip())

food101_class_names_loaded[:5]

In [None]:
%%writefile demos/food_classifier/model.py
import torch
import torchvision

from torch import nn


def create_effnetb2_model(num_classes:int=3,
                          seed:int=42):
    """Creates an EfficientNetB2 feature extractor model and transforms.

    Loads torchvision's default pretrained EfficientNet-B2 weights, freezes
    every pretrained parameter, and replaces the classifier head with a new
    Dropout + Linear head sized for `num_classes`. Only the new head is
    trainable.

    Note: calls `torch.manual_seed(seed)`, which reseeds PyTorch's global
    random number generator so the new head is initialised reproducibly.

    Args:
        num_classes (int, optional): number of classes in the classifier head.
            Defaults to 3.
        seed (int, optional): random seed value. Defaults to 42.

    Returns:
        model (torch.nn.Module): EffNetB2 feature extractor model.
        transforms (torchvision.transforms): EffNetB2 image transforms.
    """
    weights = torchvision.models.EfficientNet_B2_Weights.DEFAULT
    transforms = weights.transforms()
    model = torchvision.models.efficientnet_b2(weights=weights)

    # Freeze the pretrained network; the head created below is new and
    # therefore keeps requires_grad=True.
    for param in model.parameters():
        param.requires_grad = False

    # Read the head's input width from the pretrained classifier
    # (Sequential[Dropout, Linear]) instead of hard-coding 1408.
    head_in_features = model.classifier[1].in_features

    torch.manual_seed(seed)
    model.classifier = nn.Sequential(
        nn.Dropout(p=0.3, inplace=True),
        nn.Linear(in_features=head_in_features, out_features=num_classes),
    )

    return model, transforms

In [None]:
%%writefile demos/food_classifier/app.py
import gradio as gr
import os
import torch

from model import create_effnetb2_model
from timeit import default_timer as timer
from typing import Tuple, Dict

# Class labels: one name per line, surrounding whitespace stripped.
with open("class_names.txt", "r") as class_names_file:
    class_names = [line.strip() for line in class_names_file]

# Rebuild the architecture with a 101-way head.
effnetb2, effnetb2_transforms = create_effnetb2_model(num_classes=101)

# Restore the fine-tuned weights, mapping every tensor onto the CPU.
checkpoint_state = torch.load(
    f="pretrained_effnetb2_feature_extractor_food101_20_percent.pth",
    map_location=torch.device("cpu"),
)
effnetb2.load_state_dict(checkpoint_state)
# Drop the extra reference so the loaded state dict can be garbage-collected.
del checkpoint_state

def predict(img) -> Tuple[Dict, float]:
    """Classify one image and report how long the whole call took.

    Args:
        img: image accepted by `effnetb2_transforms` (the Gradio input is
            configured elsewhere in this file to pass a PIL image).

    Returns:
        A tuple of (mapping from every class name to its softmax probability,
        elapsed seconds rounded to 5 decimals). The elapsed time covers
        preprocessing as well as the forward pass.
    """
    started_at = timer()

    # Preprocess and add a leading batch dimension of size 1.
    batch = effnetb2_transforms(img).unsqueeze(0)

    effnetb2.eval()
    with torch.inference_mode():
        logits = effnetb2(batch)
        pred_probs = torch.softmax(logits, dim=1)

    # Only one image is in the batch, so read its row of probabilities.
    probs_for_image = pred_probs[0]
    pred_labels_and_probs = {}
    for idx, label in enumerate(class_names):
        pred_labels_and_probs[label] = float(probs_for_image[idx])

    pred_time = round(timer() - started_at, 5)

    return pred_labels_and_probs, pred_time

# Text shown on the demo page.
title = "Food Image Classifier 🍕 🍔 🌮 🍰"
description = "An EfficientNetB2 model that classifies images of food into [101 different classes](https://github.com/TensorCruncher/food-image-classifier/blob/main/data/food101_class_names.txt)."
article = "View on [GitHub](https://github.com/TensorCruncher/food-image-classifier)."

# One single-element row per file found in examples/.
example_list = [[f"examples/{file_name}"] for file_name in os.listdir("examples")]

# Input: a PIL image. Outputs: the top-5 label widget and the timing number,
# matching the (dict, float) tuple returned by predict().
image_input = gr.Image(type="pil")
prediction_outputs = [
    gr.Label(num_top_classes=5, label="Predictions"),
    gr.Number(label="Prediction time (s)"),
]

demo = gr.Interface(
    fn=predict,
    inputs=image_input,
    outputs=prediction_outputs,
    examples=example_list,
    title=title,
    description=description,
    article=article,
)

demo.launch()

In [None]:
%%writefile demos/food_classifier/requirements.txt
torch
torchvision
gradio

In [None]:
# Zip the contents of the demo directory into demos/food_classifier.zip,
# excluding bytecode, notebooks, __pycache__ and ipynb_checkpoints entries.
!cd demos/food_classifier && zip -r ../food_classifier.zip * -x "*.pyc" "*.ipynb" "*__pycache__*" "*ipynb_checkpoints*"

# Download the archive through Colab's file helper.
files.download("demos/food_classifier.zip")