In [1]:
from transformers import AutoImageProcessor, ResNetForImageClassification
import torch
from pathlib import Path 
from PIL import Image

### Ładowanie modelu
Pobieramy wytrenowany model oraz odpowiedni preprocessing - ważne jest załadowanie i użycie dokładnie tego samego preprocesowania.

In [2]:
processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

In [44]:
DATA_PATH = Path('../data/study')
DATASET_PATH = DATA_PATH / 'flowers-102'
DATASET_IMAGES_PATH = DATASET_PATH / 'jpg'

In [22]:
orig_img = Image.open(DATASET_IMAGES_PATH / 'image_00001.jpg')

In [23]:
inputs = processor(orig_img, return_tensors="pt")

In [24]:
inputs

{'pixel_values': tensor([[[[-0.4739, -0.4739, -0.4739,  ...,  0.1426,  0.1426,  0.1083],
          [-0.4397, -0.4739, -0.4739,  ...,  0.2282,  0.2624,  0.2624],
          [-0.3883, -0.4226, -0.4568,  ...,  0.3652,  0.3481,  0.3309],
          ...,
          [-1.3130, -1.2959, -1.3130,  ..., -1.4158, -1.4329, -1.4500],
          [-1.3815, -1.3815, -1.3644,  ..., -1.3473, -1.2959, -1.3130],
          [-1.4500, -1.4500, -1.4500,  ..., -1.0904, -1.0562, -1.1075]],

         [[ 0.4853,  0.5028,  0.5903,  ...,  0.5378,  0.5028,  0.4503],
          [ 0.5378,  0.5378,  0.5903,  ...,  0.6954,  0.6779,  0.6254],
          [ 0.5903,  0.5728,  0.5903,  ...,  0.8529,  0.8004,  0.7654],
          ...,
          [-1.0203, -1.0028, -1.0028,  ..., -1.2129, -1.2304, -1.2654],
          [-1.0903, -1.0903, -1.1253,  ..., -1.0553, -1.0553, -1.1604],
          [-1.1779, -1.1779, -1.2129,  ..., -0.7927, -0.8277, -0.9503]],

         [[-1.4907, -1.5256, -1.5779,  ..., -0.3404, -0.3404, -0.3927],
          [-1

In [10]:
with torch.no_grad():
    outputs = model(**inputs)

In [11]:
outputs

ImageClassifierOutputWithNoAttention(loss=None, logits=tensor([[-10.0574, -10.3454,  -9.8933, -10.4813, -10.6766,  -8.7468, -10.2083,
          -9.3474,  -9.1097,  -9.1744,  -7.5113,  -5.0071,  -7.2867,  -8.5546,
          -7.8530,  -7.9581,  -8.2434,  -9.2791, -10.1757,  -7.4227,  -9.5110,
          -9.5543, -12.3082,  -9.3422,  -9.4581,  -9.2943,  -8.4687,  -7.5100,
          -8.4294,  -8.0548,  -8.2433,  -7.3378,  -8.4285,  -9.8218,  -8.7632,
         -10.1914,  -9.6576,  -9.8403,  -7.8900,  -8.2238,  -7.9646,  -8.6409,
          -7.8016,  -6.8368,  -8.4569,  -8.6251,  -7.5550,  -8.6631,  -9.7083,
          -9.5191,  -9.1038,  -8.3499,  -8.1870,  -8.1144,  -7.9161,  -7.7026,
          -8.7816, -10.0170,  -9.5603,  -8.2470,  -9.1323,  -8.5047,  -9.1166,
          -9.8985,  -7.9680,  -8.3515, -10.1067,  -8.9973,  -9.7071,  -8.4327,
          -7.3089,  -8.8426,  -7.1491,  -6.6573,  -7.3913,  -8.1217,  -7.5213,
          -8.4534,  -7.9589,  -7.1854,  -8.5576,  -9.7972,  -9.5080,  -9.595

Logit'y - można rozumieć jako zdenormalizowane prawdopodobieństwa (z zakresu [$-\infty$, $\infty$])

In [13]:
logits = outputs.logits
logits.shape

torch.Size([1, 1000])

In [14]:
# model przewiduje jedną z 1000 klas 'ImageNet'
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])

bee


In [18]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="../models/test_trainer", eval_strategy="epoch")

Ładujemy odpowiednią metrykę

In [16]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [85]:
import os 
files = [str(DATASET_IMAGES_PATH / name) for name in os.listdir(DATASET_IMAGES_PATH)]

In [86]:
import scipy.io
labels = scipy.io.loadmat(DATASET_PATH / 'imagelabels.mat')['labels'][0]

In [87]:
from sklearn.model_selection import train_test_split

train_images_paths, images, train_labels, labels = train_test_split(files, labels, test_size=0.3, random_state=42, stratify=labels)
eval_images_paths, test_images_paths, eval_labels, test_labels = train_test_split(images, labels, test_size=0.5, random_state=42, stratify=labels)

Przygotowany dataset składa się ze zdjęć i klas

In [88]:
from datasets import Dataset, Features, Image, ClassLabel

def preprocess(image):
    return processor(image["image"], return_tensors="pt")

def prepare(files, labels):
    features = Features({"image": Image(), "label": ClassLabel(num_classes=103)})
    ds = Dataset.from_dict({"image": files, "label": labels}, features=features) 
    ds = ds.with_format("torch")
    return ds.map(preprocess, batched=True)

In [89]:
train_dataset = prepare(train_images_paths, train_labels)

Map:   0%|          | 0/5732 [00:00<?, ? examples/s]

In [57]:
train_dataset

Dataset({
    features: ['image', 'labels', 'pixel_values'],
    num_rows: 7000
})

In [90]:
test_dataset = prepare(test_images_paths, test_labels)

Map:   0%|          | 0/1229 [00:00<?, ? examples/s]

In [91]:
eval_dataset = prepare(eval_images_paths, eval_labels)

Map:   0%|          | 0/1228 [00:00<?, ? examples/s]

In [92]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Potrzebne jest konto na [wandb](https://wandb.ai/site)

In [None]:
trainer.train()

  0%|          | 0/2151 [00:00<?, ?it/s]

{'loss': 6.6118, 'grad_norm': 66.697509765625, 'learning_rate': 3.8377498837749884e-05, 'epoch': 0.7}


  0%|          | 0/154 [00:00<?, ?it/s]

{'eval_loss': 5.094245433807373, 'eval_accuracy': 0.07241659886086249, 'eval_runtime': 14.589, 'eval_samples_per_second': 84.242, 'eval_steps_per_second': 10.556, 'epoch': 1.0}
{'loss': 4.9054, 'grad_norm': 63.19883346557617, 'learning_rate': 2.6754997675499772e-05, 'epoch': 1.39}


  0%|          | 0/154 [00:00<?, ?it/s]

{'eval_loss': 4.249615669250488, 'eval_accuracy': 0.14483319772172498, 'eval_runtime': 14.2834, 'eval_samples_per_second': 86.044, 'eval_steps_per_second': 10.782, 'epoch': 2.0}
{'loss': 4.3053, 'grad_norm': 42.50358963012695, 'learning_rate': 1.5132496513249652e-05, 'epoch': 2.09}
{'loss': 3.9857, 'grad_norm': 65.10018157958984, 'learning_rate': 3.509995350999535e-06, 'epoch': 2.79}


  0%|          | 0/154 [00:00<?, ?it/s]

{'eval_loss': 4.038870334625244, 'eval_accuracy': 0.1887713588283157, 'eval_runtime': 49.1043, 'eval_samples_per_second': 25.028, 'eval_steps_per_second': 3.136, 'epoch': 3.0}
{'train_runtime': 397.1177, 'train_samples_per_second': 43.302, 'train_steps_per_second': 5.417, 'train_loss': 4.875778503276092, 'epoch': 3.0}


TrainOutput(global_step=2151, training_loss=4.875778503276092, metrics={'train_runtime': 397.1177, 'train_samples_per_second': 43.302, 'train_steps_per_second': 5.417, 'total_flos': 3.969231186369577e+17, 'train_loss': 4.875778503276092, 'epoch': 3.0})

In [94]:
trainer.evaluate(eval_dataset)

  0%|          | 0/154 [00:00<?, ?it/s]

{'eval_loss': 4.027404308319092,
 'eval_accuracy': 0.18403908794788273,
 'eval_runtime': 17.935,
 'eval_samples_per_second': 68.469,
 'eval_steps_per_second': 8.587,
 'epoch': 3.0}