In [None]:
# !pip install transformers datasets evaluate

In [None]:
import torch
from PIL import Image
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
from tqdm import tqdm
import os
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import json
from evaluate import load, list_evaluation_modules
import numpy as np

import random
import matplotlib.pyplot as plt

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Dataset layout: <mainPath>/train.csv holds the annotations and
# <mainPath>/train_images holds the image files they refer to.
mainPath = './dataset'
# mainPath = "/content/drive/MyDrive/Bakalaurinis/Dataset/"
trainPath = os.path.join(mainPath, 'train_images')
csvPath = os.path.join(mainPath, 'train.csv')

# Load the annotations in one chained expression: rename the file-name column
# to 'image' and store the labels with pandas' 'string' dtype.
df = (
    pd.read_csv(csvPath)
    .rename(columns={'image_id': 'image'})
    .assign(label=lambda frame: frame.label.astype('string'))
)

In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions (approximately) equal in both splits, which matters for a
# classification set whose classes may be imbalanced; random_state pins the
# shuffle so the split is reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
    stratify=df["label"],
)
# Drop the shuffled original row index so each split is indexed 0..n-1.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [None]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "google/vit-base-patch16-384"
# Image pre-processor configured from the checkpoint's processor config.
image_processor = ViTImageProcessor.from_pretrained(model_name)
# Classifier loaded with the checkpoint's own head (no label overrides here).
# NOTE(review): `model` is re-assigned further down with a task-specific head;
# this instance looks unused in between — confirm before removing the extra load.
model = ViTForImageClassification.from_pretrained(model_name)

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/348M [00:00<?, ?B/s]

In [None]:
def get_prediction(model, img_path):
  """Predict the class name for a single image stored under ``trainPath``.

  Args:
    model: Image-classification model. It is moved to ``device`` and called
      on the processed pixel values; its ``config.id2label`` maps the
      predicted index to the returned name.
    img_path: File name of the image, relative to ``trainPath``.

  Returns:
    The ``id2label`` entry of the highest-scoring class.
  """
  # Close the file handle as soon as the pixels are decoded, and force three
  # channels so inference sees the same input format as ``transform`` produces.
  with Image.open(os.path.join(trainPath, img_path)) as img:
    rgb_img = img.convert("RGB")
  pixel_values = image_processor(rgb_img, return_tensors="pt")["pixel_values"].to(device)
  model.to(device)

  # Inference only: use eval mode and skip autograd bookkeeping, then restore
  # whatever train/eval mode the caller had the model in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      output = model(pixel_values)
  finally:
    model.train(was_training)

  # argmax over the raw logits selects the same class softmax would.
  return model.config.id2label[int(output.logits.argmax(dim=-1))]

In [None]:
# Wrap each split in a Hugging Face Dataset and encode the string labels as a
# class-label feature (integer ids plus the list of class names).
datasetTrain = Dataset.from_pandas(train_df).class_encode_column("label")
datasetVal = Dataset.from_pandas(val_df).class_encode_column("label")

# Each split is encoded on its own, so the id <-> name mapping is only shared
# when both splits contain the same set of classes. Fail loudly otherwise:
# mismatched ids would silently corrupt the validation metrics.
assert datasetTrain.features["label"].names == datasetVal.features["label"].names, (
    "Label encoding differs between splits: "
    f"train={datasetTrain.features['label'].names}, "
    f"validation={datasetVal.features['label'].names}"
)

dataset = DatasetDict({"train": datasetTrain, "validation": datasetVal})

In [None]:
def transform(examples):
  """Turn a batch of dataset rows into model inputs.

  Args:
    examples: Batch dict with an "image" list (file names relative to
      ``trainPath``) and a "label" list.

  Returns:
    The ``image_processor`` output for the RGB-converted images (requested
    as PyTorch tensors) with the batch's labels added under "labels".
  """
  images = []
  for img_name in examples["image"]:
    # The context manager closes the file handle once the pixels are decoded;
    # convert() returns an independent in-memory image that outlives it.
    with Image.open(os.path.join(trainPath, img_name)) as img:
      images.append(img.convert("RGB"))
  inputs = image_processor(images, return_tensors="pt")
  inputs["labels"] = examples["label"]
  return inputs

# Register the transform on the splits of the dataset dict.
dataset = dataset.with_transform(transform)

In [None]:
# Sanity-check one transformed training sample. Fetch it once: each index
# access runs `transform` again (image decode + pre-processing).
sample = dataset['train'][0]
print(sample['pixel_values'].shape)
print(sample['labels'])

torch.Size([3, 384, 384])
3


In [None]:
# Class names of the training split's encoded "label" feature; the position in
# this list is used as the class id when the model config is built.
labels = dataset["train"].features["label"].names

In [None]:
def collate_fn(batch):
  """Assemble individual samples into one training batch.

  Args:
    batch: Sequence of dicts, each with a "pixel_values" tensor and a
      "labels" value.

  Returns:
    Dict with "pixel_values" stacked along a new leading dimension and
    "labels" gathered into a single tensor.
  """
  pixel_list, label_list = [], []
  for sample in batch:
    pixel_list.append(sample["pixel_values"])
    label_list.append(sample["labels"])
  return {
      "pixel_values": torch.stack(pixel_list),
      "labels": torch.tensor(label_list),
  }

In [None]:
# Metric objects are loaded once here and reused by every evaluation pass.
accuracy = load("accuracy")
f1 = load("f1")

def compute_metrics(eval_pred):
  """Score one evaluation pass with accuracy and macro-averaged F1.

  Args:
    eval_pred: Object exposing ``predictions`` (per-class scores; the argmax
      over axis 1 is taken as the predicted class) and ``label_ids``
      (reference class ids).

  Returns:
    Dict merging the outputs of both metrics' ``compute`` calls.
  """
  predicted_ids = np.argmax(eval_pred.predictions, axis=1)
  metrics = {}
  metrics.update(accuracy.compute(predictions=predicted_ids, references=eval_pred.label_ids))
  metrics.update(f1.compute(predictions=predicted_ids, references=eval_pred.label_ids, average="macro"))
  return metrics

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [None]:
# Reload the checkpoint with a classification head sized for this dataset.
# id2label maps int id -> class name and label2id maps class name -> int id,
# which are the key/value types the model config documents.
# ignore_mismatched_sizes lets the checkpoint's differently-shaped classifier
# weights be skipped and newly initialised instead of raising on load.
model = ViTForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={name: idx for idx, name in enumerate(labels)},
    ignore_mismatched_sizes=True,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-384 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
  # Output location and reporting.
  output_dir=os.path.join(mainPath, "ViT"),
  report_to='tensorboard',
  push_to_hub=False,
  # Optimisation setup.
  num_train_epochs=10,
  per_device_train_batch_size=32,
  optim='adamw_torch',
  # Evaluate, checkpoint and log on the same 1000-step cadence; keep at most
  # two checkpoints on disk and reload the best one when training finishes.
  evaluation_strategy="steps",
  eval_steps=1000,
  save_steps=1000,
  logging_steps=1000,
  save_total_limit=2,
  load_best_model_at_end=True,
  # Dataset columns must be left in place: the on-the-fly `transform` reads
  # the raw "image" column.
  remove_unused_columns=False,
)

In [None]:
trainer = Trainer(
    # What to train and with which settings.
    model=model,
    args=training_args,
    # Data: the two splits plus the function that batches their samples.
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=collate_fn,
    # Metrics reported on each evaluation pass.
    compute_metrics=compute_metrics,
    # The image processor is handed over through the `tokenizer` argument.
    tokenizer=image_processor,
)

In [None]:
# Quick check of how many directory entries the image folder contains.
print(f'Images in trainPath = {len(os.listdir(trainPath))}')

Images in trainPath = 21397


In [None]:
# Run fine-tuning; whatever train() returns is kept in `result` for later inspection.
result = trainer.train()