In [2]:
# Load subset of Food-101 dataset from Datasets library
from datasets import load_dataset

food = load_dataset("food101", split="train[:5000]")

In [3]:
food = food.train_test_split(test_size=0.2)

In [4]:
labels = food["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [5]:
# Preprocess
from transformers import AutoImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [6]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [7]:
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

In [8]:
food = food.with_transform(transforms)

In [9]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [10]:
import evaluate

accuracy = evaluate.load("accuracy")

In [11]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    output_dir="my_awesome_food_model",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=food["train"],
    eval_dataset=food["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

  5%|▌         | 10/186 [00:26<07:31,  2.57s/it]

{'loss': 4.605, 'learning_rate': 2.6315789473684212e-05, 'epoch': 0.16}


 11%|█         | 20/186 [00:51<07:03,  2.55s/it]

{'loss': 4.3558, 'learning_rate': 4.970059880239521e-05, 'epoch': 0.32}


 16%|█▌        | 30/186 [01:18<06:54,  2.66s/it]

{'loss': 3.8448, 'learning_rate': 4.670658682634731e-05, 'epoch': 0.48}


 22%|██▏       | 40/186 [01:45<06:44,  2.77s/it]

{'loss': 3.4063, 'learning_rate': 4.3712574850299406e-05, 'epoch': 0.64}


 27%|██▋       | 50/186 [02:10<05:49,  2.57s/it]

{'loss': 3.1, 'learning_rate': 4.07185628742515e-05, 'epoch': 0.8}


 32%|███▏      | 60/186 [02:36<05:17,  2.52s/it]

{'loss': 2.8306, 'learning_rate': 3.7724550898203595e-05, 'epoch': 0.96}


                                                
 33%|███▎      | 62/186 [02:57<05:13,  2.53s/it]

{'eval_loss': 2.6525375843048096, 'eval_accuracy': 0.812, 'eval_runtime': 14.9038, 'eval_samples_per_second': 67.097, 'eval_steps_per_second': 4.227, 'epoch': 0.99}


 38%|███▊      | 70/186 [03:17<05:35,  2.89s/it]

{'loss': 2.602, 'learning_rate': 3.473053892215569e-05, 'epoch': 1.12}


 43%|████▎     | 80/186 [03:41<04:24,  2.50s/it]

{'loss': 2.4431, 'learning_rate': 3.1736526946107784e-05, 'epoch': 1.28}


 48%|████▊     | 90/186 [04:06<03:55,  2.45s/it]

{'loss': 2.2696, 'learning_rate': 2.874251497005988e-05, 'epoch': 1.44}


 54%|█████▍    | 100/186 [04:31<03:34,  2.50s/it]

{'loss': 2.1412, 'learning_rate': 2.5748502994011976e-05, 'epoch': 1.6}


 59%|█████▉    | 110/186 [04:56<03:10,  2.50s/it]

{'loss': 2.0119, 'learning_rate': 2.275449101796407e-05, 'epoch': 1.76}


 65%|██████▍   | 120/186 [05:21<02:43,  2.48s/it]

{'loss': 1.9128, 'learning_rate': 1.9760479041916168e-05, 'epoch': 1.92}


                                                 
 67%|██████▋   | 125/186 [05:47<02:31,  2.48s/it]

{'eval_loss': 1.8672749996185303, 'eval_accuracy': 0.846, 'eval_runtime': 14.0191, 'eval_samples_per_second': 71.331, 'eval_steps_per_second': 4.494, 'epoch': 2.0}


 70%|██████▉   | 130/186 [06:01<03:20,  3.57s/it]

{'loss': 1.9139, 'learning_rate': 1.6766467065868263e-05, 'epoch': 2.08}


 75%|███████▌  | 140/186 [06:25<01:52,  2.45s/it]

{'loss': 1.783, 'learning_rate': 1.377245508982036e-05, 'epoch': 2.24}


 81%|████████  | 150/186 [06:49<01:26,  2.41s/it]

{'loss': 1.7273, 'learning_rate': 1.0778443113772455e-05, 'epoch': 2.4}


 86%|████████▌ | 160/186 [07:13<01:02,  2.39s/it]

{'loss': 1.7289, 'learning_rate': 7.784431137724551e-06, 'epoch': 2.56}


 91%|█████████▏| 170/186 [07:37<00:38,  2.43s/it]

{'loss': 1.7028, 'learning_rate': 4.7904191616766475e-06, 'epoch': 2.72}


 97%|█████████▋| 180/186 [08:02<00:14,  2.42s/it]

{'loss': 1.6765, 'learning_rate': 1.7964071856287426e-06, 'epoch': 2.88}


                                                 
100%|██████████| 186/186 [08:30<00:00,  2.41s/it]

{'eval_loss': 1.6671103239059448, 'eval_accuracy': 0.889, 'eval_runtime': 14.2508, 'eval_samples_per_second': 70.171, 'eval_steps_per_second': 4.421, 'epoch': 2.98}


100%|██████████| 186/186 [08:33<00:00,  2.76s/it]

{'train_runtime': 513.0451, 'train_samples_per_second': 23.39, 'train_steps_per_second': 0.363, 'train_loss': 2.5306988377724924, 'epoch': 2.98}





TrainOutput(global_step=186, training_loss=2.5306988377724924, metrics={'train_runtime': 513.0451, 'train_samples_per_second': 23.39, 'train_steps_per_second': 0.363, 'train_loss': 2.5306988377724924, 'epoch': 2.98})

In [16]:
ds = load_dataset("food101", split="validation[:10]")
image = ds["image"][0]

In [17]:
from transformers import pipeline

classifier = pipeline("image-classification", model="my_awesome_food_model/checkpoint-186")
classifier(image)

[{'label': 'beignets', 'score': 0.2093808650970459},
 {'label': 'hamburger', 'score': 0.01796024665236473},
 {'label': 'bruschetta', 'score': 0.012402575463056564},
 {'label': 'prime_rib', 'score': 0.01185260433703661},
 {'label': 'chicken_wings', 'score': 0.011092047207057476}]