This notebook is using transfer learning from a ViT model from huggingface


In [2]:
import multiprocessing
import transformers
import torch
import wandb
import numpy as np
import pytorch_lightning as pl

from torchvision import transforms 
from torchvision.datasets import Food101
from torchvision.datasets.utils import download_url
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

from datasets import load_metric, load_dataset
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
from transformers import AutoFeatureExtractor
from transformers import DefaultDataCollator

In [3]:
metric = load_metric("accuracy")
food = load_dataset("food101", split="train")

splits = food.train_test_split(test_size=0.15)
train = splits['train']
val = splits['test']

display(train.shape, val.shape)

Using custom data configuration default
Reusing dataset food101 (C:\Users\truon\.cache\huggingface\datasets\food101\default\0.0.0\7cebe41a80fb2da3f08fcbef769c8874073a86346f7fb96dc0847d4dfc318295)


(64387, 2)

(11363, 2)

In [4]:
# Process image to tensor using ViT method (16x16 patches)
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
feature_extractor

ViTFeatureExtractor {
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "size": 224
}

The `AutoFeatureExtractor.from_pretrained()` method helps us make sure we are 
- (1) resizing the inputs to the appropriate size 
- (2) using the appropriate image mean and standard deviation for the model architecture we are going to use

In [5]:
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
_transforms = Compose(
    [
            RandomResizedCrop(feature_extractor.size)
            ,ToTensor()
            ,normalize
    ]
)

val_transforms = Compose(
        [
            RandomResizedCrop(feature_extractor.size),
            ToTensor(),
            normalize,
        ]
    )

def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

# when the data is loaded, it will apply the transformation above
train = train.with_transform(transforms)
val = val.with_transform(transforms)

# example of how our data is organized, dictionary with 2 key-value pairs
train[0]


{'label': 35,
 'pixel_values': tensor([[[ 0.4431,  0.4824,  0.5059,  ...,  0.9922,  0.9922,  0.9922],
          [ 0.4510,  0.4196,  0.4510,  ...,  0.9922,  0.9843,  0.9843],
          [ 0.3490,  0.3490,  0.3490,  ...,  0.9922,  1.0000,  0.9922],
          ...,
          [ 0.0118,  0.0118, -0.0118,  ...,  0.9216,  0.8980,  0.8902],
          [ 0.0118,  0.0039,  0.0196,  ...,  0.9059,  0.9059,  0.8980],
          [ 0.0196, -0.0118,  0.0196,  ...,  0.8824,  0.8902,  0.8667]],
 
         [[-0.1294, -0.0745, -0.0353,  ...,  1.0000,  1.0000,  1.0000],
          [-0.1294, -0.1294, -0.0745,  ...,  1.0000,  0.9922,  0.9922],
          [-0.1922, -0.2078, -0.2235,  ...,  0.9922,  0.9922,  1.0000],
          ...,
          [-0.3569, -0.3490, -0.3725,  ...,  0.4902,  0.4667,  0.4588],
          [-0.3490, -0.3490, -0.3333,  ...,  0.4824,  0.4745,  0.4745],
          [-0.3333, -0.3647, -0.3333,  ...,  0.4667,  0.4588,  0.4431]],
 
         [[-0.6000, -0.5451, -0.5059,  ...,  0.8824,  0.8824,  0.8824]

In [6]:
# 3 RGBs, 224 x 224 pixels
train[0]['pixel_values'].shape

torch.Size([3, 224, 224])

In [7]:
# ~ creating a dataloader, creates batches 
# `pt` is for PyTorch Tensor
data_collator = DefaultDataCollator()
data_collator

DefaultDataCollator(return_tensors='pt')

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [9]:
labels = train.features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

model = AutoModelForImageClassification.from_pretrained(
    pretrained_model_name_or_path="google/vit-base-patch16-224-in21k",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments(
    output_dir="../102722run",
    per_device_train_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=30,
    fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb",
    push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=val,
    compute_metrics = compute_metrics,
    tokenizer=feature_extractor,
)


c:\Users\truon\Documents\projects\food\notebooks\../102722run is already a clone of https://huggingface.co/stochastic/102722run. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend


In [11]:
train_results = trainer.train()
trainer.save_model(output_dir = "./models/")
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** Running training *****
  Num examples = 64387
  Num Epochs = 30
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 60390
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwinsontruong[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/60390 [00:00<?, ?it/s]

{'loss': 4.5771, 'learning_rate': 0.00019996688193409505, 'epoch': 0.0}
{'loss': 4.4866, 'learning_rate': 0.00019993376386819012, 'epoch': 0.01}
{'loss': 4.4235, 'learning_rate': 0.00019990064580228516, 'epoch': 0.01}
{'loss': 4.2814, 'learning_rate': 0.0001998675277363802, 'epoch': 0.02}
{'loss': 4.1408, 'learning_rate': 0.00019983440967047528, 'epoch': 0.02}
{'loss': 4.0479, 'learning_rate': 0.00019980129160457032, 'epoch': 0.03}
{'loss': 3.9138, 'learning_rate': 0.00019976817353866536, 'epoch': 0.03}
{'loss': 3.8012, 'learning_rate': 0.0001997350554727604, 'epoch': 0.04}
{'loss': 3.7385, 'learning_rate': 0.00019970193740685544, 'epoch': 0.04}
{'loss': 3.5957, 'learning_rate': 0.0001996688193409505, 'epoch': 0.05}
{'loss': 3.5138, 'learning_rate': 0.00019963570127504553, 'epoch': 0.05}
{'loss': 3.4593, 'learning_rate': 0.0001996025832091406, 'epoch': 0.06}
{'loss': 3.2644, 'learning_rate': 0.00019956946514323564, 'epoch': 0.06}
{'loss': 3.241, 'learning_rate': 0.00019953634707733068,

***** Running Evaluation *****
  Num examples = 11363
  Batch size = 8


  0%|          | 0/1421 [00:00<?, ?it/s]

Saving model checkpoint to ../102722run\checkpoint-2013
Configuration saved in ../102722run\checkpoint-2013\config.json


{'eval_loss': 1.159117341041565, 'eval_accuracy': 0.7056235149168354, 'eval_runtime': 103.8891, 'eval_samples_per_second': 109.376, 'eval_steps_per_second': 13.678, 'epoch': 1.0}


Model weights saved in ../102722run\checkpoint-2013\pytorch_model.bin
Feature extractor saved in ../102722run\checkpoint-2013\preprocessor_config.json
Feature extractor saved in ../102722run\preprocessor_config.json


{'loss': 1.0039, 'learning_rate': 0.00019331346249379036, 'epoch': 1.0}
{'loss': 1.0572, 'learning_rate': 0.00019328034442788543, 'epoch': 1.01}
{'loss': 1.0777, 'learning_rate': 0.00019324722636198047, 'epoch': 1.01}
{'loss': 1.1844, 'learning_rate': 0.00019321410829607551, 'epoch': 1.02}
{'loss': 0.8593, 'learning_rate': 0.00019318099023017058, 'epoch': 1.02}
{'loss': 0.9875, 'learning_rate': 0.00019314787216426563, 'epoch': 1.03}
{'loss': 1.1458, 'learning_rate': 0.00019311475409836067, 'epoch': 1.03}
{'loss': 1.1599, 'learning_rate': 0.0001930816360324557, 'epoch': 1.04}
{'loss': 0.9967, 'learning_rate': 0.00019304851796655078, 'epoch': 1.04}
{'loss': 1.1294, 'learning_rate': 0.00019301539990064582, 'epoch': 1.05}
{'loss': 1.0354, 'learning_rate': 0.00019298228183474086, 'epoch': 1.05}
{'loss': 1.2394, 'learning_rate': 0.0001929491637688359, 'epoch': 1.06}
{'loss': 0.9632, 'learning_rate': 0.00019291604570293095, 'epoch': 1.06}
{'loss': 1.1097, 'learning_rate': 0.000192882927637026

***** Running Evaluation *****
  Num examples = 11363
  Batch size = 8


  0%|          | 0/1421 [00:00<?, ?it/s]

Saving model checkpoint to ../102722run\checkpoint-4026
Configuration saved in ../102722run\checkpoint-4026\config.json


{'eval_loss': 1.0900472402572632, 'eval_accuracy': 0.723664525213412, 'eval_runtime': 105.2453, 'eval_samples_per_second': 107.967, 'eval_steps_per_second': 13.502, 'epoch': 2.0}


Model weights saved in ../102722run\checkpoint-4026\pytorch_model.bin
Feature extractor saved in ../102722run\checkpoint-4026\preprocessor_config.json
Feature extractor saved in ../102722run\preprocessor_config.json


{'loss': 1.1292, 'learning_rate': 0.0001866666666666667, 'epoch': 2.0}
{'loss': 0.767, 'learning_rate': 0.00018663354860076173, 'epoch': 2.01}
{'loss': 0.8348, 'learning_rate': 0.00018660043053485677, 'epoch': 2.01}
{'loss': 0.9144, 'learning_rate': 0.0001865673124689518, 'epoch': 2.02}
{'loss': 0.7701, 'learning_rate': 0.00018653419440304686, 'epoch': 2.02}
{'loss': 0.8194, 'learning_rate': 0.0001865010763371419, 'epoch': 2.03}
{'loss': 0.7605, 'learning_rate': 0.00018646795827123697, 'epoch': 2.03}
{'loss': 0.742, 'learning_rate': 0.000186434840205332, 'epoch': 2.04}
{'loss': 0.8735, 'learning_rate': 0.00018640172213942705, 'epoch': 2.04}
{'loss': 0.9402, 'learning_rate': 0.00018636860407352212, 'epoch': 2.05}
{'loss': 0.8447, 'learning_rate': 0.00018633548600761716, 'epoch': 2.05}
{'loss': 0.8801, 'learning_rate': 0.0001863023679417122, 'epoch': 2.06}
{'loss': 0.9024, 'learning_rate': 0.00018626924987580728, 'epoch': 2.06}
{'loss': 1.006, 'learning_rate': 0.00018623613180990232, 'ep

***** Running Evaluation *****
  Num examples = 11363
  Batch size = 8


  0%|          | 0/1421 [00:00<?, ?it/s]

Saving model checkpoint to ../102722run\checkpoint-6039
Configuration saved in ../102722run\checkpoint-6039\config.json


{'eval_loss': 1.0368103981018066, 'eval_accuracy': 0.7319369884713544, 'eval_runtime': 105.8044, 'eval_samples_per_second': 107.396, 'eval_steps_per_second': 13.43, 'epoch': 3.0}


Model weights saved in ../102722run\checkpoint-6039\pytorch_model.bin
Feature extractor saved in ../102722run\checkpoint-6039\preprocessor_config.json
Feature extractor saved in ../102722run\preprocessor_config.json


NotADirectoryError: [WinError 267] The directory name is invalid: 'C:\\Users\\truon\\AppData\\Local\\Temp\\tmp_1yi8akr\\lfs_progress'