In [1]:
import os
import glob

import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image
from datasets import load_dataset
from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer

d:\Python\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
d:\Python\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
# Dataset layout on disk: a root folder holding one sub-folder per split.
ROOT_DATASET_PATH = Path("./cifake/")
TRAIN_DATASET_PATH = ROOT_DATASET_PATH.joinpath("train")
TEST_DATASET_PATH = ROOT_DATASET_PATH.joinpath("test")

In [3]:
# Load the image folders under the dataset root; the result is indexed by
# split name ("train" / "test") in later cells.
# Reuse ROOT_DATASET_PATH instead of repeating the literal path so the dataset
# location is configured in exactly one place.
data = load_dataset("imagefolder", data_dir=str(ROOT_DATASET_PATH))

Resolving data files:   0%|          | 0/100000 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20000 [00:00<?, ?it/s]

In [4]:
# Build both directions of the class-name <-> integer-id mapping, using the
# order reported by the training split's "label" feature.
labels = data["train"].features["label"].names
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for idx, name in enumerate(labels)}

In [5]:
# Hub identifier of the pretrained backbone that gets fine-tuned below.
checkpoint = "microsoft/resnet-50"
# Load the preprocessing configuration published with the checkpoint; its
# size and mean/std values drive the torchvision transforms built later.
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# Bare expression: echoes the resolved processor configuration as cell output.
image_processor

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


ConvNextImageProcessor {
  "crop_pct": 0.875,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "ConvNextImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}

In [33]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# Normalise with the mean/std taken from the checkpoint's processor config so
# inputs are scaled the same way that processor would scale them.
normalize = Normalize(mean=image_processor.image_mean,
                      std=image_processor.image_std)

# The processor config expresses its target size either as an explicit
# height/width pair or as a single shortest-edge length.
if "height" in image_processor.size:
    size = (image_processor.size["height"], image_processor.size["width"])
    crop_size = size
    max_size = None
elif "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
    crop_size = (size, size)
    # NOTE: max_size is recorded here but is not passed to any transform below.
    max_size = image_processor.size.get("longest_edge")
else:
    # Fail fast with a clear message instead of hitting a NameError on
    # `size` / `crop_size` further down when the schema is unexpected.
    raise ValueError(
        "Unsupported image_processor.size format: expected 'height'/'width' "
        f"or 'shortest_edge', got {image_processor.size!r}"
    )

# Training pipeline: random crop + horizontal flip for augmentation.
train_transforms = Compose(
    [
        RandomResizedCrop(crop_size),
        RandomHorizontalFlip(),
        ToTensor(),
        normalize,
    ]
)

# Evaluation pipeline: resize + centre crop only, with no random augmentation.
val_transforms = Compose(
    [
        Resize(size),
        CenterCrop(crop_size),
        ToTensor(),
        normalize,
    ]
)

def preprocess_train(example_batch):
    """Add a ``pixel_values`` column holding the augmented training tensors.

    Each entry of ``example_batch["image"]`` is converted to RGB and run
    through ``train_transforms``. The batch is modified in place and returned.
    """
    rgb_images = (picture.convert("RGB") for picture in example_batch["image"])
    example_batch["pixel_values"] = [train_transforms(rgb) for rgb in rgb_images]
    return example_batch

def preprocess_val(example_batch):
    """Add a ``pixel_values`` column holding the evaluation tensors.

    Each entry of ``example_batch["image"]`` is converted to RGB and run
    through ``val_transforms``. The batch is modified in place and returned.
    """
    pixel_values = []
    for picture in example_batch["image"]:
        pixel_values.append(val_transforms(picture.convert("RGB")))
    example_batch["pixel_values"] = pixel_values
    return example_batch

# Pick out the two splits and attach the on-the-fly transforms: the augmenting
# pipeline for training, the resize/centre-crop pipeline for the test split.
train_ds, test_ds = data["train"], data["test"]

train_ds.set_transform(transform=preprocess_train)
test_ds.set_transform(transform=preprocess_val)

In [35]:
# Load the pretrained backbone and attach this dataset's label mappings.
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    label2id=label2id,
    id2label=id2label,
    # The checkpoint's classifier head has 1000 outputs; allow it to be
    # re-initialised for this dataset's labels instead of failing on the
    # shape mismatch (see the warning printed below the cell).
    ignore_mismatched_sizes=True,
)

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-50 and are newly initialized because the shapes did not match:
- classifier.1.weight: found shape torch.Size([1000, 2048]) in the checkpoint and torch.Size([2, 2048]) in the model instantiated
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
import numpy as np
from datasets import load_metric

# F1 metric object consumed by compute_metrics.
# NOTE(review): datasets.load_metric is reported as deprecated in favour of the
# separate `evaluate` package in newer `datasets` releases — confirm against
# the installed version before upgrading.
metric = load_metric("f1")

def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level F1 ``metric``.

    ``eval_pred.predictions`` is reduced with an argmax over axis 1 to get one
    predicted class id per row, which is compared with ``eval_pred.label_ids``.
    Returns whatever ``metric.compute`` returns.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids
    return metric.compute(references=references, predictions=predicted_ids)

In [37]:
# Training hyper-parameters.
batch_size = 32
epoch = 5
# Last path component of the hub id, e.g. "org/name" -> "name".
model_name = checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-cifake-epoch{epoch}",
    # Optimisation schedule.
    num_train_epochs=epoch,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    # Each optimiser step accumulates 4 batches of `batch_size` per device.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best "f1" when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=10,
    # Presumably required so the raw "image" column survives for the dataset
    # transforms to read — confirm against the Trainer documentation.
    remove_unused_columns=False,
)

In [38]:
import torch

def collate_fn(examples):
    """Collate per-example dicts into a single model-ready batch.

    Stacks every example's ``pixel_values`` tensor along a new leading
    dimension and gathers the ``label`` entries into one tensor.

    Returns:
        dict with keys ``"pixel_values"`` and ``"labels"``.
    """
    stacked_images = torch.stack([sample["pixel_values"] for sample in examples])
    targets = [sample["label"] for sample in examples]
    batch = {"pixel_values": stacked_images}
    batch["labels"] = torch.tensor(targets)
    return batch

In [39]:
# Wire the model, hyper-parameters, datasets and helper callables together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    # The image processor is passed in the `tokenizer` slot — presumably so it
    # is saved together with the model; confirm against the Trainer docs.
    tokenizer=image_processor,
)

In [41]:
import wandb

# Start a Weights & Biases run before training begins. No project or run name
# is passed here, so those presumably come from wandb defaults or the
# environment — confirm if a specific project is expected.
wandb.init()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msulthanabiyyu[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [42]:
# Run the fine-tuning loop; the returned object exposes the aggregate
# training metrics through `.metrics`.
train_results = trainer.train()

# Persist the model held by the trainer once training has finished.
trainer.save_model()

# Write the training metrics under the "train" split name, then save the
# trainer's own state.
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

  0%|          | 0/3905 [00:00<?, ?it/s]

{'loss': 0.6932, 'learning_rate': 1.2787723785166241e-06, 'epoch': 0.01}
{'loss': 0.6925, 'learning_rate': 2.5575447570332483e-06, 'epoch': 0.03}
{'loss': 0.6939, 'learning_rate': 3.836317135549873e-06, 'epoch': 0.04}
{'loss': 0.6916, 'learning_rate': 5.1150895140664966e-06, 'epoch': 0.05}
{'loss': 0.6925, 'learning_rate': 6.3938618925831205e-06, 'epoch': 0.06}
{'loss': 0.6908, 'learning_rate': 7.672634271099745e-06, 'epoch': 0.08}
{'loss': 0.6917, 'learning_rate': 8.95140664961637e-06, 'epoch': 0.09}
{'loss': 0.689, 'learning_rate': 1.0230179028132993e-05, 'epoch': 0.1}
{'loss': 0.6901, 'learning_rate': 1.1508951406649617e-05, 'epoch': 0.12}
{'loss': 0.6858, 'learning_rate': 1.2787723785166241e-05, 'epoch': 0.13}
{'loss': 0.6857, 'learning_rate': 1.4066496163682865e-05, 'epoch': 0.14}
{'loss': 0.6852, 'learning_rate': 1.534526854219949e-05, 'epoch': 0.15}
{'loss': 0.6827, 'learning_rate': 1.6624040920716114e-05, 'epoch': 0.17}
{'loss': 0.6811, 'learning_rate': 1.790281329923274e-05, '

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.3235321044921875, 'eval_f1': 0.845711092483934, 'eval_runtime': 182.9557, 'eval_samples_per_second': 109.316, 'eval_steps_per_second': 3.416, 'epoch': 1.0}
{'loss': 0.35, 'learning_rate': 4.432270916334662e-05, 'epoch': 1.01}
{'loss': 0.3443, 'learning_rate': 4.418042117245305e-05, 'epoch': 1.02}
{'loss': 0.3533, 'learning_rate': 4.403813318155948e-05, 'epoch': 1.04}
{'loss': 0.3302, 'learning_rate': 4.389584519066591e-05, 'epoch': 1.05}
{'loss': 0.34, 'learning_rate': 4.3753557199772344e-05, 'epoch': 1.06}
{'loss': 0.3598, 'learning_rate': 4.3611269208878774e-05, 'epoch': 1.08}
{'loss': 0.3465, 'learning_rate': 4.3468981217985204e-05, 'epoch': 1.09}
{'loss': 0.3262, 'learning_rate': 4.3326693227091634e-05, 'epoch': 1.1}
{'loss': 0.3451, 'learning_rate': 4.3184405236198064e-05, 'epoch': 1.11}
{'loss': 0.3513, 'learning_rate': 4.3042117245304495e-05, 'epoch': 1.13}
{'loss': 0.3535, 'learning_rate': 4.289982925441093e-05, 'epoch': 1.14}
{'loss': 0.329, 'learning_rate': 4.

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.21972163021564484, 'eval_f1': 0.908125722695259, 'eval_runtime': 73.4864, 'eval_samples_per_second': 272.159, 'eval_steps_per_second': 8.505, 'epoch': 2.0}
{'loss': 0.2753, 'learning_rate': 3.322424587364826e-05, 'epoch': 2.01}
{'loss': 0.2956, 'learning_rate': 3.30819578827547e-05, 'epoch': 2.02}
{'loss': 0.3043, 'learning_rate': 3.293966989186113e-05, 'epoch': 2.04}
{'loss': 0.3227, 'learning_rate': 3.279738190096756e-05, 'epoch': 2.05}
{'loss': 0.2729, 'learning_rate': 3.265509391007399e-05, 'epoch': 2.06}
{'loss': 0.297, 'learning_rate': 3.251280591918042e-05, 'epoch': 2.07}
{'loss': 0.3339, 'learning_rate': 3.237051792828685e-05, 'epoch': 2.09}
{'loss': 0.2966, 'learning_rate': 3.222822993739329e-05, 'epoch': 2.1}
{'loss': 0.3021, 'learning_rate': 3.208594194649972e-05, 'epoch': 2.11}
{'loss': 0.3073, 'learning_rate': 3.1943653955606154e-05, 'epoch': 2.12}
{'loss': 0.2731, 'learning_rate': 3.1801365964712584e-05, 'epoch': 2.14}
{'loss': 0.3183, 'learning_rate': 3.1

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.1846153289079666, 'eval_f1': 0.9242236024844721, 'eval_runtime': 72.8641, 'eval_samples_per_second': 274.484, 'eval_steps_per_second': 8.578, 'epoch': 3.0}
{'loss': 0.3044, 'learning_rate': 2.2125782583949916e-05, 'epoch': 3.01}
{'loss': 0.3036, 'learning_rate': 2.1983494593056346e-05, 'epoch': 3.02}
{'loss': 0.2783, 'learning_rate': 2.184120660216278e-05, 'epoch': 3.03}
{'loss': 0.2652, 'learning_rate': 2.169891861126921e-05, 'epoch': 3.05}
{'loss': 0.2824, 'learning_rate': 2.1556630620375643e-05, 'epoch': 3.06}
{'loss': 0.2632, 'learning_rate': 2.1414342629482073e-05, 'epoch': 3.07}
{'loss': 0.285, 'learning_rate': 2.1272054638588503e-05, 'epoch': 3.08}
{'loss': 0.2621, 'learning_rate': 2.1129766647694933e-05, 'epoch': 3.1}
{'loss': 0.2566, 'learning_rate': 2.0987478656801367e-05, 'epoch': 3.11}
{'loss': 0.2855, 'learning_rate': 2.08451906659078e-05, 'epoch': 3.12}
{'loss': 0.2797, 'learning_rate': 2.070290267501423e-05, 'epoch': 3.14}
{'loss': 0.2698, 'learning_rate'

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.18106049299240112, 'eval_f1': 0.926023983803146, 'eval_runtime': 72.6686, 'eval_samples_per_second': 275.222, 'eval_steps_per_second': 8.601, 'epoch': 4.0}
{'loss': 0.2665, 'learning_rate': 1.1027319294251566e-05, 'epoch': 4.01}
{'loss': 0.2572, 'learning_rate': 1.0885031303357997e-05, 'epoch': 4.02}
{'loss': 0.2696, 'learning_rate': 1.0742743312464429e-05, 'epoch': 4.03}
{'loss': 0.2987, 'learning_rate': 1.0600455321570859e-05, 'epoch': 4.04}
{'loss': 0.2928, 'learning_rate': 1.0458167330677293e-05, 'epoch': 4.06}
{'loss': 0.2496, 'learning_rate': 1.0315879339783723e-05, 'epoch': 4.07}
{'loss': 0.2645, 'learning_rate': 1.0173591348890154e-05, 'epoch': 4.08}
{'loss': 0.2559, 'learning_rate': 1.0031303357996586e-05, 'epoch': 4.1}
{'loss': 0.2567, 'learning_rate': 9.889015367103018e-06, 'epoch': 4.11}
{'loss': 0.2601, 'learning_rate': 9.746727376209448e-06, 'epoch': 4.12}
{'loss': 0.282, 'learning_rate': 9.60443938531588e-06, 'epoch': 4.13}
{'loss': 0.2622, 'learning_rate

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.16951845586299896, 'eval_f1': 0.9315419722065534, 'eval_runtime': 74.5195, 'eval_samples_per_second': 268.386, 'eval_steps_per_second': 8.387, 'epoch': 5.0}
{'train_runtime': 6642.1472, 'train_samples_per_second': 75.277, 'train_steps_per_second': 0.588, 'train_loss': 0.33123261595810266, 'epoch': 5.0}


In [43]:
# Evaluate the model currently held by the trainer on the eval dataset.
metrics = trainer.evaluate()
# Print the formatted "eval metrics" summary and also persist it under the
# "eval" split name.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

  0%|          | 0/625 [00:00<?, ?it/s]

***** eval metrics *****
  epoch                   =        5.0
  eval_f1                 =     0.9315
  eval_loss               =     0.1695
  eval_runtime            = 0:01:11.99
  eval_samples_per_second =    277.791
  eval_steps_per_second   =      8.681
