# Food-101 — ViT Training Notebook

This notebook trains a ViT model on Food-101 using Hugging Face `datasets` + `transformers`.
Cells:
1. Install dependencies
2. Imports & environment checks
3. Load & inspect dataset
4. Create train/val/test splits (fixes applied)
5. Label mappings
6. Image processor & transforms
7. Apply transforms to dataset (preprocessing)
8. Data collator & dataloader test
9. Model creation
10. Training arguments
11. Trainer and training
12. Evaluation & reporting
13. Save / export model

**Practical notes:** reduce the batch size if you run out of GPU memory. If running CPU-only, set `fp16=False` and use small batch sizes.


In [None]:
!pip install -q transformers datasets accelerate evaluate scikit-learn matplotlib torchvision

In [None]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
from torchvision.transforms import (
    Compose, Resize, CenterCrop, RandomResizedCrop, RandomHorizontalFlip, ToTensor, Normalize
)
# Report the installed torch version and whether a CUDA device is usable,
# so the rest of the notebook's GPU/CPU defaults are easy to sanity-check.
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", "cuda")
else:
    print("device:", "cpu")


Load the Food-101 dataset

In [None]:
# Download (or load from cache) the Food-101 dataset.
ds = load_dataset("food101")
print(ds)  # shows available splits (usually 'train' and 'test')

# Keep handles on the training split and its label feature; both are reused
# for every inspection print below.
train_split = ds['train']
label_feature = train_split.features['label']
print("Features:", train_split.features)
print("Num classes:", len(label_feature.names))

# Look at the first training example to verify that images and labels line up.
sample = train_split[0]
sample_label_name = label_feature.names[sample['label']]
print("Sample keys:", sample.keys())
print("Sample label id / name:", sample['label'], sample_label_name)
plt.imshow(sample['image'])
plt.axis('off')
plt.title(sample_label_name)

Create train/validation split from the original 'train'


In [None]:
# Hold out a fraction of the original 'train' split for validation; the seed
# keeps the split reproducible across runs.
val_fraction = 0.1
split = ds['train'].train_test_split(test_size=val_fraction, seed=42)

# The official 'test' split is used untouched for the final evaluation.
datasets = dict(
    train=split['train'],
    val=split['test'],
    test=ds['test'],
)
print({name: len(subset) for name, subset in datasets.items()})


In [None]:
# Class names come from the dataset's label feature; their list position is
# the integer label id.
class_names = datasets['train'].features['label'].names

# Forward (id -> name) and reverse (name -> id) lookups for the model config.
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in enumerate(class_names)}
print("Example labels:", list(class_names[:5]))

In [None]:
# Checkpoint used for both the image processor and the model weights.
MODEL_NAME = "google/vit-base-patch16-224"

# The processor carries the preprocessing settings (target size, mean, std)
# that the transforms below are built from.
processor = ViTImageProcessor.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)

`processor.image_mean` and `processor.image_std` are lists; `processor.size` is either a dict or an int, depending on the installed `transformers` version.


In [None]:
# `processor.size` may be a dict (read its "height") or a bare int.
if isinstance(processor.size, dict):
    img_size = processor.size["height"]
else:
    img_size = processor.size

# Per-channel normalization statistics taken from the processor.
mean = processor.image_mean
std = processor.image_std

torchvision transforms (we will convert PIL -> tensor -> normalized using processor's mean/std)

In [None]:
# Training pipeline: light augmentation (random crop covering 80-100% of the
# image, random horizontal flip), then tensor conversion and normalization.
train_transforms = Compose(
    [
        RandomResizedCrop(size=img_size, scale=(0.8, 1.0)),
        RandomHorizontalFlip(p=0.5),
        ToTensor(),
        Normalize(mean=mean, std=std),
    ]
)

# Validation/test pipeline: deterministic resize to a square, then the same
# tensor conversion and normalization. After the square resize the image is
# already img_size x img_size, so the center crop leaves it unchanged.
val_test_transforms = Compose(
    [
        Resize(size=(img_size, img_size)),
        CenterCrop(size=img_size),
        ToTensor(),
        Normalize(mean=mean, std=std),
    ]
)

print("Image size:", img_size, "mean/std:", mean, std)

In [None]:
from PIL import Image

We'll run each example image through the transforms and store the result under `pixel_values` as float NumPy arrays (the transformed tensors converted with `.numpy()`).


In [None]:
def train_transform_examples(example):
    """Apply the training augmentations to a batch of examples.

    Registered through ``with_transform``, so it runs lazily whenever rows
    are read from the training split.

    Args:
        example: Batch dict whose ``"image"`` entry is an iterable of images
            (PIL images, or array-likes accepted by ``Image.fromarray``).

    Returns:
        The same dict with an added ``"pixel_values"`` list holding one
        NumPy array per image, produced by ``train_transforms``.
    """
    pixel_values = []
    for img in example["image"]:
        # `Image.fromarray` is meant for array-likes. Images that are already
        # PIL objects (what the `datasets` Image feature is documented to
        # decode to) are used directly instead of being re-wrapped.
        pil_img = img if isinstance(img, Image.Image) else Image.fromarray(img)
        tensor = train_transforms(pil_img.convert("RGB"))
        # Stored as NumPy rather than torch tensors; the collate function
        # converts back to tensors when batching.
        pixel_values.append(tensor.numpy())
    example["pixel_values"] = pixel_values
    return example

def val_transform_examples(example):
    """Apply the deterministic validation/test preprocessing to a batch.

    Registered through ``with_transform``, so it runs lazily whenever rows
    are read from the validation or test split.

    Args:
        example: Batch dict whose ``"image"`` entry is an iterable of images
            (PIL images, or array-likes accepted by ``Image.fromarray``).

    Returns:
        The same dict with an added ``"pixel_values"`` list holding one
        NumPy array per image, produced by ``val_test_transforms``.
    """
    pixel_values = []
    for img in example["image"]:
        # `Image.fromarray` is meant for array-likes. Images that are already
        # PIL objects (what the `datasets` Image feature is documented to
        # decode to) are used directly instead of being re-wrapped.
        pil_img = img if isinstance(img, Image.Image) else Image.fromarray(img)
        tensor = val_test_transforms(pil_img.convert("RGB"))
        pixel_values.append(tensor.numpy())
    example["pixel_values"] = pixel_values
    return example

# Set transforms. These are applied lazily on access.
datasets["train"] = datasets["train"].with_transform(train_transform_examples)
datasets["val"] = datasets["val"].with_transform(val_transform_examples)
datasets["test"] = datasets["test"].with_transform(val_transform_examples)

# Quick check on one item. Reading a single row yields that row's own
# `pixel_values` (one image array), not a list of images, so report the shape
# of the whole array: indexing it with [0] would only show a single channel.
item = datasets["train"][0]
import torch
print(
    "pixel_values shape:",
    tuple(torch.as_tensor(item["pixel_values"]).shape),
    "label:",
    item["label"],
)

In [None]:
import torch
def collate_fn(batch):
    """Collate dataset rows into a model-ready batch.

    Args:
        batch: Sequence of row dicts. Each row has ``"pixel_values"`` (one
            image as a tensor, array, or nested list, optionally wrapped in
            a list of images) and an integer ``"label"``.

    Returns:
        Dict with ``"pixel_values"``, the float32 images stacked along a new
        leading batch axis, and ``"labels"``, an int64 tensor of the labels.
    """
    images = []
    for row in batch:
        value = row["pixel_values"]
        if isinstance(value, torch.Tensor):
            pixels = value.to(torch.float32)
        else:
            # Going through NumPy handles arrays, nested lists and lists of
            # arrays uniformly; torch.tensor copies, so the batch never
            # aliases dataset memory.
            pixels = torch.tensor(np.asarray(value, dtype=np.float32))
        # Decide on rank, not on the Python type: a 4-D value is a list
        # wrapping image(s), so keep the first image. A 3-D value is already
        # one image, even when it arrives as a nested list, and must keep all
        # of its channels.
        if pixels.ndim == 4:
            pixels = pixels[0]
        images.append(pixels)
    labels = torch.tensor([row["label"] for row in batch], dtype=torch.long)
    return {"pixel_values": torch.stack(images), "labels": labels}

# Test DataLoader (small batch): pull a single shuffled batch through
# `collate_fn` and print what the model will receive.
from torch.utils.data import DataLoader

train_dl = DataLoader(
    dataset=datasets["train"],
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)
train_iter = iter(train_dl)
batch = next(train_iter)
print("Batch keys:", batch.keys())
print("pixel_values shape:", batch["pixel_values"].shape)
print("labels shape:", batch["labels"].shape)


In [None]:
# Load the pretrained ViT checkpoint with a classification head sized for the
# Food-101 label set and the id/name mappings stored in its config.
model = ViTForImageClassification.from_pretrained(
    MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
    # NOTE(review): presumably needed because the checkpoint's own head has a
    # different number of classes than this dataset; confirm.
    ignore_mismatched_sizes=True,
)

# Move the model to the GPU when one is available, otherwise keep it on CPU.
if torch.cuda.is_available():
    model.to("cuda")
else:
    model.to("cpu")
print(model)

In [None]:
from transformers import TrainingArguments

import inspect

# Directory that receives checkpoints during training and the final model.
OUTPUT_DIR = "food101-vit-model"

# The evaluation-schedule keyword is `eval_strategy` in newer `transformers`
# releases and `evaluation_strategy` in older ones. The notebook installs
# `transformers` unpinned, so pick whichever name the installed version
# accepts instead of hard-coding one.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# if no GPU, set fp16=False and reduce batch sizes
# (handled automatically below via torch.cuda.is_available()).
train_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=8 if torch.cuda.is_available() else 2,
    per_device_eval_batch_size=16 if torch.cuda.is_available() else 4,
    # Evaluate and checkpoint on the same 500-step cadence so the best
    # checkpoint can be identified and restored at the end of training.
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=1000,
    fp16=torch.cuda.is_available(),  # only enable if CUDA available
    save_total_limit=2,
    # The custom collate function reads "pixel_values" and "label" directly
    # from each row, so the Trainer must not prune dataset columns.
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval loss is better
    dataloader_num_workers=4,
    push_to_hub=False,
    **{_eval_strategy_kwarg: "steps"},
)
print(train_args)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        p: Prediction object exposing ``predictions`` (per-class scores,
            arg-maxed over axis 1) and ``label_ids`` (true labels).

    Returns:
        Dict with keys ``accuracy``, ``precision_macro``, ``recall_macro``
        and ``f1_macro``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    # zero_division=0 scores classes without predictions as 0 instead of
    # emitting warnings.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision,
        "recall_macro": recall,
        "f1_macro": f1,
    }

import inspect

# Newer `transformers` releases take the processor as `processing_class=`;
# older ones only know `tokenizer=`. The notebook installs `transformers`
# unpinned, so use whichever keyword the installed Trainer accepts.
_processor_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Main trainer: full training split for training, held-out validation split
# for periodic evaluation.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    # Passing the image processor lets the Trainer keep it with the model.
    **{_processor_kwarg: processor},
)
print("Trainer created.")



For quick testing, set num_train_epochs=1 and/or use a small subset of the dataset.


In [None]:
# Run fine-tuning with the main trainer: the full training split, scheduled
# and checkpointed as configured in `train_args`.
trainer.train()

In [None]:
import inspect

# Smoke test on a small slice: the first 256 training and 128 validation rows.
#
# NOTE(review): this trainer shares the same `model` object as `trainer`, so
# running this cell after the full training run keeps updating those weights.
# Run it instead of, not after, the full run unless that is intended.
#
# NOTE(review): with 256 examples and the batch sizes in `train_args`, the run
# ends in fewer than 500 steps, so the 500-step eval/save interval is never
# reached and the 1000-step warmup never completes.
small_train = datasets["train"].select(range(256))
small_val = datasets["val"].select(range(128))

# Newer `transformers` releases take the processor as `processing_class=`;
# older ones only know `tokenizer=`. Use whichever the installed Trainer
# accepts.
_small_processor_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer_small = Trainer(
    model=model,
    args=train_args,
    train_dataset=small_train,
    eval_dataset=small_val,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    **{_small_processor_kwarg: processor},
)
trainer_small.train()

In [None]:

from sklearn.metrics import classification_report, confusion_matrix

# Score the official test split with the main trainer.
preds_output = trainer.predict(datasets["test"])
print("Test metrics (HF):", preds_output.metrics)

# Turn per-class scores into predicted label ids.
y_true = preds_output.label_ids
y_pred = np.argmax(preds_output.predictions, axis=1)

print("Overall accuracy:", np.mean(y_pred == y_true))

# The per-class report is limited to label ids 0-9 to keep the output short.
print("\nClassification report (first 10 classes):")
print(
    classification_report(
        y_true,
        y_pred,
        labels=list(range(10)),
        target_names=class_names[:10],
        zero_division=0,
    )
)

In [None]:
# Write the trained model and the image processor into the same directory so
# the exported folder holds both the weights and the preprocessing config.
trainer.save_model(output_dir=OUTPUT_DIR)
processor.save_pretrained(save_directory=OUTPUT_DIR)
print("Saved model and processor to", OUTPUT_DIR)