<a href="https://colab.research.google.com/github/Satwikram/Computer-Vision-Implementations/blob/main/Vision%20Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
!pip install transformers

In [2]:
from google.colab import files

files.upload()

! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


### Downloading Dataset

In [3]:
!kaggle datasets download -d shaunthesheep/microsoft-catsvsdogs-dataset

Downloading microsoft-catsvsdogs-dataset.zip to /content
100% 786M/788M [00:35<00:00, 25.1MB/s]
100% 788M/788M [00:35<00:00, 23.4MB/s]


In [None]:
!unzip /content/microsoft-catsvsdogs-dataset.zip

### Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import string
import re

import os
from pathlib import Path

import matplotlib.pyplot as plt
import cv2
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input, Flatten, Conv2D, MaxPool2D, GlobalAveragePooling1D, BatchNormalization, Embedding, Bidirectional, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard

# import keras_nlp


from sklearn.model_selection import train_test_split

from unicodedata import normalize

from tqdm import tqdm

from PIL import Image

from transformers import ViTFeatureExtractor, TFViTForImageClassification

import plotly.express as px
from datetime import date

from sklearn.metrics import *

import joblib

### Global Variables

In [18]:
img_shape = (224, 224)

dog_path = Path("/content/PetImages/Dog")
cat_path = Path("/content/PetImages/Cat")

uniq_labels = ["cat", "dog"]

today = date.today()

### Cleaning Image

In [19]:
def clean_img(fname):

  img = cv2.imread(fname)

  img = cv2.resize(img, img_shape) 

  # Normalization
  img = img/255.0

  return img

### Image Extractor

In [20]:
def extract_image(path, target):

  count = 0

  X = []
  y = []

  for img in os.listdir(path):

    if count <=500:

      _, tail = os.path.splitext(img)

      if tail in [".jpg", ".jpeg", ".png"]:

        fname = f"{path}/{img}"

        try:
          # Cleaning the Image
          img = clean_img(fname)
          X.extend([img])
          y.extend([target])
          count += 1
        
        except Exception as e: print(f"Excepttion: {e}") 

  return X, y

In [21]:
X_dog, y_dog = extract_image(dog_path, 0)
X_cat, y_cat = extract_image(cat_path, 1)

Excepttion: OpenCV(4.6.0) /io/opencv/modules/imgproc/src/resize.cpp:4052: error: (-215:Assertion failed) !ssize.empty() in function 'resize'

Excepttion: OpenCV(4.6.0) /io/opencv/modules/imgproc/src/resize.cpp:4052: error: (-215:Assertion failed) !ssize.empty() in function 'resize'



In [22]:
X = X_dog + X_cat
y = y_dog + y_cat

### Splitting Data Into Train/Test

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

In [84]:
y_train = np.array(y_train)
y_test = np.array(y_test)

### Feature Extraction

In [55]:
checkpoint = "google/vit-base-patch16-224"

In [56]:
def feature_extraction(samples):
  
  feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)

  return feature_extractor(samples, return_tensors="np")["pixel_values"]

In [57]:
X_train = feature_extraction(X_train)
X_test = feature_extraction(X_test)

### Building the model

In [85]:
def build_model(targets, checkpoint, input_shape):

  base_model = TFViTForImageClassification.from_pretrained(checkpoint)

  pixel_values = Input(shape=input_shape, name="pixel_values")

  x = base_model.vit(pixel_values=pixel_values)[0]

  x = Flatten()(x)

  units = len(np.unique(targets))

  if units > 2:
      activation = "softmax"
      loss = "sparse_categorical_crossentropy"
  else:
      activation = "sigmoid"
      loss = "binary_crossentropy"
      units = units - 1

  outputs = Dense(units, activation = activation, name = "outputs")(x)

  model = Model(inputs=pixel_values, outputs=outputs)

  optimizer =  tf.keras.optimizers.Adam()

  model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

  # Model Architecture Export
  tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True, 
                          show_dtype=True, show_layer_names=True, rankdir='TB',
                          expand_nested=True, dpi=300, layer_range=None, 
                          show_layer_activations=True)

  return model

In [86]:
model = build_model(y_train, checkpoint, X_train[0].shape)

All model checkpoint layers were used when initializing TFViTForImageClassification.

All the layers of TFViTForImageClassification were initialized from the model checkpoint at google/vit-base-patch16-224.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTForImageClassification for predictions without further training.


In [None]:
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 3, batch_size = 32)

Epoch 1/3
Epoch 2/3
Epoch 3/3