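This notebook uses transfer learning: it fine-tunes a pretrained Vision Transformer (ViT) checkpoint (`google/vit-base-patch16-224-in21k`) from Hugging Face on a 5,000-image subset of the Food-101 dataset.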


In [31]:
import multiprocessing
import transformers
import torch
import wandb
import numpy as np
import pytorch_lightning as pl

from torchvision import transforms
from torchvision.datasets import Food101
from torchvision.datasets.utils import download_url
from transformers import AutoFeatureExtractor
from datasets import load_metric

# `sys` is needed by the `sys.path.append` call below; without this import the
# cell fails with NameError on a fresh kernel.
import sys

# Accuracy metric used later by `compute_metrics`.
metric = load_metric("accuracy")

# Make the project's `src` directory importable so the local packages resolve.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable project root so the notebook runs elsewhere.
sys.path.append('C:/Users/truon/Documents/projects/food/src')
from datamodule.dataloader import Food101DataModule
# NOTE(review): wildcard import hides which names the notebook actually relies on.
from model.transfer_resnet import  *

In [12]:
from datasets import load_dataset

# Load only the first 5000 training examples of Food-101 to keep the run short.
food = load_dataset("food101", split="train[:5000]")

# Hold out 20% of the subset for evaluation. The seed is fixed so that the
# train/test partition is the same on every Restart & Run All; without it the
# split (and therefore the reported accuracy) changes between runs.
food = food.train_test_split(test_size=0.2, seed=42)


In [14]:
# Class names, in the order defined by the dataset's "label" feature.
labels = food["train"].features["label"].names

# Lookup tables between class names and their stringified integer indices,
# passed to the model configuration further down.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [15]:
# Preprocessing configuration that ships with the ViT checkpoint; its
# `image_mean`, `image_std` and `size` attributes drive the transform pipeline.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path="google/vit-base-patch16-224-in21k"
)

In [16]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

# Normalise with the same per-channel statistics the checkpoint's extractor uses.
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)

# Depending on the installed transformers version, `feature_extractor.size` is
# either a plain int or a dict ({"shortest_edge": ...} or {"height": ..., "width": ...}).
# RandomResizedCrop accepts an int or an (h, w) pair, so normalise the value first
# instead of passing the raw attribute through.
_size = feature_extractor.size
if isinstance(_size, dict):
    if "shortest_edge" in _size:
        crop_size = _size["shortest_edge"]
    else:
        crop_size = (_size["height"], _size["width"])
else:
    crop_size = _size

# Random crop -> tensor -> normalisation, applied per image by `transforms`.
_transforms = Compose([RandomResizedCrop(crop_size), ToTensor(), normalize])

In [17]:
def transforms(examples):
    """Convert a batch's images into model inputs.

    Every entry of ``examples["image"]`` is converted to RGB and run through
    the module-level ``_transforms`` pipeline; the results are stored under
    ``examples["pixel_values"]`` and the ``"image"`` entry is removed.

    The batch is modified in place and also returned.

    NOTE(review): this function name shadows the ``transforms`` module
    imported from torchvision in the imports cell.
    """
    pixel_values = []
    for image in examples["image"]:
        rgb_image = image.convert("RGB")
        pixel_values.append(_transforms(rgb_image))
    examples["pixel_values"] = pixel_values
    del examples["image"]
    return examples

# Attach the function to the dataset object. `food` holds both the "train" and
# the "test" split, so both go through the same pipeline.
# NOTE(review): that pipeline contains RandomResizedCrop, so evaluation images
# are randomly cropped too; confirm this is intended.
food = food.with_transform(transforms)

In [18]:
from transformers import DefaultDataCollator

# Collator handed to the Trainer to assemble examples into batches; the tensor
# format is spelled out explicitly (it matches the library default).
data_collator = DefaultDataCollator(return_tensors="pt")

In [19]:
# NOTE(review): this cell repeats the previous cell verbatim (same import, same
# `data_collator` assignment). It looks like a leftover and is a candidate for
# removal; it is kept here unchanged.
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [20]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` (taken over the last axis) against ``labels``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_ids)

In [23]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

# Load the pretrained ViT checkpoint with a classification head sized for this
# dataset's labels; the name/index mappings are stored in the model config.
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(labels),
)

Downloading pytorch_model.bin:   0%|          | 0.00/330M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Training configuration: evaluate and checkpoint every 100 steps, log every 10,
# keep at most two checkpoints on disk, and report metrics to Weights & Biases.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    evaluation_strategy="steps",
    num_train_epochs=10,
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # present so the notebook still runs (in full precision) on CPU-only machines.
    fp16=torch.cuda.is_available(),
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    # Keep the "image" column: the on-the-fly transform needs it to build
    # "pixel_values", so the Trainer must not drop it as unused.
    remove_unused_columns=False,
    report_to="wandb"
)

# Wire model, data, collator and metric function together. The feature extractor
# is passed as `tokenizer` so it is saved alongside each checkpoint.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=food["train"],
    eval_dataset=food["test"],
    compute_metrics = compute_metrics,
    tokenizer=feature_extractor,
)


PyTorch: setting up devices
Using cuda_amp half precision backend


In [36]:
# Run fine-tuning, then persist the resulting model.
train_results = trainer.train()
trainer.save_model(output_dir="./models/")

# Log and save the training metrics, followed by the trainer state.
train_metrics = train_results.metrics
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)
trainer.save_state()

***** Running training *****
  Num examples = 4000
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy
100,0.1594,0.571418,0.864
200,0.0758,0.473172,0.88
300,0.0349,0.529542,0.863
400,0.0835,0.4547,0.883
500,0.1514,0.429929,0.891
600,0.0983,0.516395,0.89
700,0.0413,0.471797,0.895
800,0.0791,0.438805,0.888
900,0.0527,0.451742,0.899
1000,0.0236,0.398755,0.909


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-100
Configuration saved in ./results\checkpoint-100\config.json
Model weights saved in ./results\checkpoint-100\pytorch_model.bin
Feature extractor saved in ./results\checkpoint-100\preprocessor_config.json
Deleting older checkpoint [results\checkpoint-1100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-200
Configuration saved in ./results\checkpoint-200\config.json
Model weights saved in ./results\checkpoint-200\pytorch_model.bin
Feature extractor saved in ./results\checkpoint-200\preprocessor_config.json
Deleting older checkpoint [results\checkpoint-1200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-300
Configuration saved in ./results\checkpoint-300\config.json
Model 

***** train metrics *****
  epoch                    =         10.0
  total_flos               = 2889363046GF
  train_loss               =        0.066
  train_runtime            =   0:08:20.29
  train_samples_per_second =       79.953
  train_steps_per_second   =        2.499
