In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import os
import random
from PIL import Image
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import Trainer, TrainingArguments
from transformers import AutoImageProcessor, AutoModelForImageClassification

import warnings
warnings.filterwarnings("ignore")
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"

## Preparing data for training

In [2]:
model_path = 'google/vit-base-patch16-224'
processor = AutoImageProcessor.from_pretrained(model_path)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [3]:
img = Image.open('/kaggle/input/image-classification/train/train/cactus/image_00188.jpg')
processor(img, return_tensors='pt')

{'pixel_values': tensor([[[[-0.0353, -0.0275, -0.0196,  ...,  0.2549,  0.2549,  0.2549],
          [-0.0353, -0.0275, -0.0196,  ...,  0.2549,  0.2549,  0.2549],
          [-0.0353, -0.0275, -0.0196,  ...,  0.2549,  0.2549,  0.2549],
          ...,
          [-0.1686, -0.1686, -0.1686,  ...,  0.1216,  0.1216,  0.1216],
          [-0.1765, -0.1765, -0.1765,  ...,  0.1137,  0.1137,  0.1137],
          [-0.1843, -0.1843, -0.1843,  ...,  0.1059,  0.1059,  0.1059]],

         [[-0.0510, -0.0431, -0.0353,  ...,  0.2549,  0.2549,  0.2549],
          [-0.0510, -0.0431, -0.0353,  ...,  0.2549,  0.2549,  0.2549],
          [-0.0510, -0.0431, -0.0353,  ...,  0.2549,  0.2549,  0.2549],
          ...,
          [-0.1843, -0.1843, -0.1843,  ...,  0.1216,  0.1216,  0.1216],
          [-0.1922, -0.1922, -0.1922,  ...,  0.1137,  0.1137,  0.1137],
          [-0.2000, -0.2000, -0.2000,  ...,  0.1059,  0.1059,  0.1059]],

         [[ 0.0510,  0.0588,  0.0667,  ...,  0.3490,  0.3490,  0.3490],
          [ 0

In [4]:
class_to_label = {
    0: 'cactus',
    1: 'fern',
    2: 'rose',
    3: 'sunflower',
    4: 'tulip'
}

label_to_class = {name: idx for idx, name in class_to_label.items()}

In [5]:
class ImgDataset(Dataset):
    def __init__(self, img_path, processor):
        self.paths = []
        self.labels = []
        for label in os.listdir(img_path):
            label_path = os.path.join(img_path, label)
            for image in os.listdir(label_path):
                self.paths.append(os.path.join(label_path, image))
                self.labels.append(label_to_class[label])
        self.processor = processor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):      
        image = Image.open(self.paths[idx])
        item = self.processor(image, return_tensors='pt')
        item["pixel_values"] = item["pixel_values"].squeeze(0)
        item["labels"] = self.labels[idx]
        return item

In [6]:
train_dataset = ImgDataset('/kaggle/input/image-classification/train/train', processor)
len(train_dataset)

2400

In [7]:
train_dataset[0]

{'pixel_values': tensor([[[-0.6392, -0.6863, -0.7020,  ..., -0.8980, -0.8667, -0.8588],
         [-0.6157, -0.6549, -0.6941,  ..., -0.9137, -0.8824, -0.8902],
         [-0.6392, -0.6157, -0.6078,  ..., -0.9686, -0.9608, -0.8980],
         ...,
         [-0.3804, -0.3490, -0.2706,  ...,  0.4588,  0.4588,  0.4039],
         [-0.3647, -0.2863, -0.2000,  ...,  0.5529,  0.5451,  0.4824],
         [-0.3725, -0.3098, -0.2000,  ...,  0.4980,  0.5765,  0.5686]],

        [[ 0.0980,  0.0745,  0.0902,  ...,  0.1608,  0.1686,  0.1608],
         [ 0.1529,  0.1294,  0.1059,  ...,  0.1608,  0.1765,  0.1608],
         [ 0.1608,  0.2000,  0.2078,  ...,  0.1137,  0.1294,  0.1843],
         ...,
         [ 0.3569,  0.2784,  0.1216,  ...,  0.1137,  0.1294,  0.1608],
         [ 0.3569,  0.3333,  0.1686,  ...,  0.0118,  0.0196,  0.0667],
         [ 0.3333,  0.2941,  0.1529,  ..., -0.1451, -0.0745,  0.0510]],

        [[ 0.2078,  0.1843,  0.2000,  ...,  0.2392,  0.2471,  0.2314],
         [ 0.2549,  0.2314, 

In [8]:
train_size = int(len(train_dataset) * 0.8)
val_size = len(train_dataset) - train_size
torch.manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

## Fine-tuning the model

In [9]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [10]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    accuracy = accuracy_score(labels, preds)
    f1score = f1_score(labels, preds, average='weighted')
    return {'accuracy': accuracy, 'f1': f1score}

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [12]:
model = AutoModelForImageClassification.from_pretrained(model_path, num_labels=5, ignore_mismatched_sizes=True).to(device)
model

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [14]:
for i, layer in enumerate(model.children()):
    if i != 1:
        for param in layer.parameters():
            param.requires_grad = False

In [15]:
train_args = TrainingArguments(
    num_train_epochs=3,
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    warmup_steps=30, # 1920 сэмплов / 16 batch size = 120 шагов в эпохе
    eval_strategy='epoch',
    lr_scheduler_type='constant',
    run_name='',
    output_dir='/kaggle/working/output',
    logging_dir='/kaggle/working/logs',
    save_strategy='epoch',
    logging_strategy='epoch',
    disable_tqdm=False,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
trainer = Trainer(
    model=model,
    args=train_args,
    tokenizer=processor,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0879,0.696588,0.814583,0.814739
2,0.5423,0.482571,0.860417,0.859194
3,0.4009,0.394916,0.89375,0.893092


TrainOutput(global_step=360, training_loss=0.6770513852437338, metrics={'train_runtime': 116.4108, 'train_samples_per_second': 49.48, 'train_steps_per_second': 3.092, 'total_flos': 4.463658617654477e+17, 'train_loss': 0.6770513852437338, 'epoch': 3.0})

## Making prediction

In [19]:
model.eval()

def predict_sample(img_path):
    image = Image.open(img_path)
    item = processor(image, return_tensors='pt')
    # item["pixel_values"] = item["pixel_values"].squeeze(0)
    item = item.to(device)
    with torch.no_grad():
        outputs = model(**item)
    prob = torch.nn.functional.softmax(outputs.logits, dim=1)
    pred = torch.argmax(prob, dim=1).item()
    return pred

In [23]:
sub = pd.read_csv('/kaggle/input/image-classification/sample_submission.csv')
sub.head()

Unnamed: 0,name,label
0,image_62214.jpg,
1,image_91562.jpg,
2,image_44104.jpg,
3,image_79943.jpg,
4,image_79847.jpg,


In [24]:
labels = []
paths = sub['name'].to_list()
for path in tqdm_notebook(paths):
    labels.append(class_to_label[predict_sample('/kaggle/input/image-classification/test/test/' + path)])
sub['label'] = labels
sub.to_csv('sub.csv', index=False)
sub.head()

  0%|          | 0/600 [00:00<?, ?it/s]

Unnamed: 0,name,label
0,image_62214.jpg,rose
1,image_91562.jpg,rose
2,image_44104.jpg,rose
3,image_79943.jpg,rose
4,image_79847.jpg,rose
