In [1]:
# Install the notebook's dependencies into the running kernel's environment
# (%pip targets the kernel's interpreter, unlike a bare `!pip`).
# `datasets` is capped below 3.0 because this notebook imports `load_metric`,
# which was removed from the package in the 3.0 release.
%pip install "datasets<3.0" transformers
%pip install -U accelerate

In [2]:
import os
import numpy as np
import cv2
import datasets
from datasets import load_dataset
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader, Subset, random_split
import torchvision.transforms as transforms
import datasets
import pandas as pd
from datasets import load_dataset, DatasetDict, load_metric
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score
import matplotlib.pyplot as plt
from torchvision.utils import make_grid 
import time
from transformers import ViTFeatureExtractor, AutoImageProcessor

In [3]:
# Seed PyTorch's global random number generator with a fixed value (42)
torch.manual_seed(seed=42)

In [4]:
import os
# Summarise what is available under /kaggle/input: one line per directory that
# directly contains files, with the number of files in it. (The previous loop
# built each file path and discarded it, so it produced no output at all.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        print(f"{dirname}: {len(filenames)} files")

In [5]:
# Choose the compute device: the first CUDA GPU when one is available, else the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

In [6]:
# Read the image dataset from disk using the generic "imagefolder" builder,
# pointing it at the Kaggle input directory that holds the font images.
img_path = "/kaggle/input/font-style/data"
data = load_dataset(path="imagefolder", data_dir=img_path)

In [7]:
# Sanity check: display the 'image' field of one arbitrary training example
# (index 1001) to eyeball what the raw data looks like.
data['train'][1001]['image']

In [8]:
# Fetch the 'label' feature descriptor of the training split and display it.
# NOTE(review): `labels` is rebound further down to
# `prepared_ds['train'].features['label'].names`, so the name refers to a
# different kind of object later in the notebook.
labels = data['train'].features['label']
labels

In [9]:
# Carve the single 'train' split into train / validation / test.
# Step 1 holds out 30% of the examples; step 2 cuts that hold-out in half,
# giving a 70% / 15% / 15% partition. Both shuffles use seed 0.
data_ds = data['train'].train_test_split(test_size=0.3, shuffle=True, seed=0)
data_test_ds = data_ds['test'].train_test_split(test_size=0.5, shuffle=True, seed=0)

final_dataset = DatasetDict(
    train=data_ds['train'],
    validation=data_test_ds['train'],
    test=data_test_ds['test'],
)
final_dataset

In [10]:
# Checkpoint used both for the image preprocessing config and for the model weights.
model_path = 'google/vit-base-patch16-224-in21k'
# Load the checkpoint's image processor through AutoImageProcessor (already
# imported at the top of the notebook) instead of the deprecated
# ViTFeatureExtractor class. The variable keeps its original name because
# later cells refer to `feature_extractor`.
feature_extractor = AutoImageProcessor.from_pretrained(model_path)

In [11]:
# final_dataset holds raw images; turn each batch into model-ready pixel tensors
def transform(batch):
    """Preprocess a batch of examples for the ViT model.

    Args:
        batch: mapping with an 'image' list and a matching 'label' list.
            Assumes the images are PIL images (they must expose `.convert`).

    Returns:
        The output of `feature_extractor` (called with return_tensors='pt'),
        with the batch's labels stored under the 'label' key.
    """
    # Force every image to 3-channel RGB first: grayscale, RGBA or palette
    # images would otherwise reach the processor with an unexpected channel layout.
    images = [image.convert("RGB") for image in batch['image']]
    inputs = feature_extractor(images, return_tensors='pt')
    inputs['label'] = batch['label']
    return inputs

# Register `transform` on every split of final_dataset.
# NOTE(review): with_transform is expected to apply the function when rows are
# accessed rather than eagerly — confirm against the datasets documentation.
prepared_ds = final_dataset.with_transform(transform)

In [12]:
# Display the prepared DatasetDict (splits and their sizes) as a quick check.
prepared_ds

In [13]:
# Display the feature schema of the training split.
prepared_ds['train'].features

In [14]:
def collateFunction(batch):
    """Merge a list of per-example dicts into one batch dict.

    Args:
        batch: list of dicts, each with a 'pixel_values' tensor and a 'label' value.

    Returns:
        dict with 'pixel_values' (the example tensors stacked along a new
        leading dimension) and 'labels' (a tensor built from the labels).
    """
    images = []
    targets = []
    for example in batch:
        images.append(example['pixel_values'])
        targets.append(example['label'])
    return {
        'pixel_values': torch.stack(images),
        'labels': torch.tensor(targets),
    }

In [15]:
# Accuracy metric object shared by every evaluation call.
# NOTE(review): `load_metric` is deprecated upstream in favour of the separate
# `evaluate` package — confirm the installed datasets version still ships it.
metric = load_metric("accuracy")


def computeMetrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: prediction object exposing `predictions` (per-class scores, classes
            along axis 1) and `label_ids` (reference class ids).

    Returns:
        Whatever `metric.compute` returns for the arg-max predictions
        compared against the reference labels.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predicted_ids, references=p.label_ids)

In [16]:
# ViT for Image classification model
from transformers import ViTForImageClassification
from transformers import AutoImageProcessor


# Class names, in the order of their integer ids.
labels = prepared_ds['train'].features['label'].names

# Load the pretrained ViT checkpoint with a classification head sized for our
# classes; both id<->name mappings use stringified ids.
model = ViTForImageClassification.from_pretrained(
    model_path,
    num_labels=len(labels),
    id2label=dict((str(idx), name) for idx, name in enumerate(labels)),
    label2id=dict((name, str(idx)) for idx, name in enumerate(labels)),
)

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir="./vit-base",
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=5,
    per_device_train_batch_size=16,
    learning_rate=2e-4,
    # --- evaluation / logging (evaluate on the same 100-step cadence as saving) ---
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    report_to='tensorboard',
    # --- data handling ---
    # Presumably kept False so the raw 'image' column survives for `transform` — confirm.
    remove_unused_columns=False,
)

In [18]:
from transformers import Trainer
# Wire the model, data, batching and metric functions into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    data_collator=collateFunction,
    compute_metrics=computeMetrics,
    # The image processor is passed through the `tokenizer` slot.
    tokenizer=feature_extractor,
)

In [19]:
# Run fine-tuning, then persist the artefacts of the run.
# The order matters: training must finish before anything is saved.
train_results = trainer.train()
trainer.save_model()  # write the model held by the trainer after training
trainer.log_metrics("train", train_results.metrics)  # report the training metrics
trainer.save_metrics("train", train_results.metrics)  # persist the same metrics
trainer.save_state()  # persist the trainer's state

In [20]:
def softmax(logits):
    """Convert logits to probabilities with a softmax over the last axis.

    Args:
        logits: array-like of raw scores. For a batch shaped
            (n_samples, n_classes) every row is normalised independently;
            a 1-D vector is normalised as a whole.

    Returns:
        np.ndarray with the same shape as `logits` whose entries along the
        last axis are non-negative and sum to 1.
    """
    logits = np.asarray(logits)
    if logits.size == 0:
        # Nothing to normalise; return an empty float array of the same shape.
        return np.exp(logits)
    # A 0-d input has no last axis; reduce over everything in that case.
    axis = -1 if logits.ndim else None
    # Subtract the per-row maximum before exponentiating so large logits do not
    # overflow to inf; this leaves the softmax value unchanged.
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp_logits = np.exp(shifted)
    # Normalise per row (keepdims so the division broadcasts), not over the whole batch.
    probabilities = exp_logits / np.sum(exp_logits, axis=axis, keepdims=True)
    return probabilities

In [22]:
# Run the trained model over the held-out test split; `output` exposes
# `.predictions` and `.label_ids`, which the cells below read.
output = trainer.predict(prepared_ds['test'])

In [23]:
# Check the shape of the raw prediction array (expected: one row per test example — confirm).
output.predictions.shape

In [24]:
# Class names in id order (kept for reference; not used by the accuracy computation below).
label_ordering = prepared_ds['train'].features['label'].names

# Score the test-set predictions: normalise the logits, take the highest-scoring
# class per example, and compare against the reference labels.
# NOTE(review): `log_probs` holds the output of `softmax`, i.e. probabilities
# rather than log-probabilities, despite its name.
logits, target_labels = output.predictions, output.label_ids
log_probs = softmax(logits)
prediction_labels = log_probs.argmax(axis=-1)
print(f"Accuracy ->  {accuracy_score(target_labels, prediction_labels)}")