In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

# **Importing MNIST**

---



---



In [4]:
(X_train,Y_train),(X_test,Y_test)=tf.keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [7]:
X_train=X_train/255
X_test=X_test/255

# **Defining VIT architecture**

---



---



In [8]:
class Attention(tf.keras.layers.Layer):
  def __init__(self,d_model,num_heads,rate):
    super(Attention,self).__init__()
    self.d_model=d_model
    self.num_heads=num_heads
    self.Key=tf.keras.layers.Dense(d_model)
    self.Query=tf.keras.layers.Dense(d_model)
    self.Value=tf.keras.layers.Dense(d_model)
    self.dropout=tf.keras.layers.Dropout(rate=rate)
    self.norm=tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def AttScore(self,K,Q,V,batch_size):
    kq=tf.matmul(Q,K,transpose_b=True)
    kq=kq/tf.sqrt(tf.cast(self.d_model,tf.float32))
    soft=tf.nn.softmax(kq,axis=-1)
    return tf.matmul(soft,V)
                                # [ Batch_size , seq_len , d_model ] -> [ Batch_size , num_heads , seq_len , depth ]
  def split_heads(self,inputs,batch_size):        # [ Batch_size , image_height , image_width ] -> [ Batch_size , number_of_patches , patch_height , patch_width ]
    X=tf.reshape(inputs,(batch_size,-1,self.num_heads,self.d_model//self.num_heads))
    return tf.transpose(X,perm=[0,2,1,3])

  def call(self,inputs,training):
    batch_size=tf.shape(inputs)[0]
    K=self.Key(inputs)
    Q=self.Query(inputs)
    V=self.Value(inputs)
    key=self.split_heads(K,batch_size)
    query=self.split_heads(Q,batch_size)
    value=self.split_heads(V,batch_size)
    AttScore=self.AttScore(key,query,value,batch_size)
    AttScore=tf.transpose(AttScore,perm=[0,2,1,3])
    AttScore=tf.reshape(AttScore,(batch_size,-1,self.d_model))
    AttScore=self.dropout(AttScore,training=training)
    return self.norm(inputs+AttScore)

In [9]:
class PointwiseFFN(tf.keras.layers.Layer):
  def __init__(self,dff,d_model,rate):
    super(PointwiseFFN,self).__init__()
    self.d_model=d_model
    self.dense1=tf.keras.layers.Dense(dff,activation='relu')
    self.dense2=tf.keras.layers.Dense(d_model )
    self.dropout=tf.keras.layers.Dropout(rate=rate)
    self.norm=tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self,inputs,training):
    X=self.dense1(inputs)
    X=self.dense2(X)
    X=self.dropout(X,training=training)
    return self.norm(inputs+X)

In [10]:
class VITLayer(tf.keras.layers.Layer):
  def __init__(self,d_model,num_heads,dff,rate):
    super(VITLayer,self).__init__()
    self.mha=Attention(d_model,num_heads,rate)
    self.FFN=PointwiseFFN(dff,d_model,rate)

  def call(self,inputs,training):
    X=self.mha(inputs,training=training)
    X=self.FFN(X,training=training)
    return X

In [11]:
class VIT(tf.keras.models.Model):
  def __init__(self,d_model,num_heads,dff,num_layers,num_patches,rate):
    super(VIT,self).__init__()
    self.d_model=d_model
    self.num_layers=num_layers
    self.num_patches=num_patches
    self.embedding=tf.keras.layers.Dense(d_model)
    self.PosEn=self.PositionalEn(num_patches,d_model)
    self.enc_layers=[VITLayer(d_model,num_heads,dff,rate) for _ in range(num_layers)]
    self.global_pool = tf.keras.layers.GlobalAveragePooling1D()
    self.dense1=tf.keras.layers.Dense(64,activation='relu')
    self.dense2=tf.keras.layers.Dense(10,activation='softmax')

  def PositionalEn(self,num_patches,d_model):
    angles=self.GetAngle(np.arange(num_patches)[:,np.newaxis],np.arange(d_model)[np.newaxis,:],d_model)
    angles[:,0::2]=np.sin(angles[:,0::2])
    angles[:,1::2]=np.cos(angles[:,1::2])
    pos_en=angles[np.newaxis,...]
    return tf.cast(pos_en,dtype=tf.float32)

  def GetAngle(self,pos,i,d_model):
    angle_rates=1/np.power(10000,(2*(i//2))/np.float32(d_model))
    return pos*angle_rates

  def call(self,inputs,training=False):
    num_patches=tf.shape(inputs)[1]
    X=self.embedding(inputs)
    X+=self.PosEn[:,:num_patches,:]
    for i in range(self.num_layers):
      X=self.enc_layers[i](X,training=training)
    X=self.global_pool(X)
    X=self.dense1(X)
    return self.dense2(X)


In [12]:
d_model=150
num_heads=5
dff=256
num_layers=2
num_patches=4
rate=0.0
model=VIT(d_model,num_heads,dff,num_layers,num_patches,rate)

# **Patching Images**

---



---



In [13]:
def patch_image(inputs,patch_size=[14,14]):        # [ batch_size , image_height , image_width ] -> [ batch_size , num_patches , flatten_patch ]
  image_shape=inputs.shape
  assert (image_shape[1]*image_shape[2])%(patch_size[0]*patch_size[1])==0,'image size is not divisible by patch size'
  num_patches=(image_shape[1]*image_shape[2])//(patch_size[0]*patch_size[1])
  return np.reshape(inputs,(image_shape[0],num_patches,(image_shape[1]*image_shape[2])//num_patches))

In [14]:
X_trainPatch=patch_image(X_train)
X_testPatch=patch_image(X_test)

In [16]:
trainDataset = tf.data.Dataset.from_tensor_slices((X_trainPatch, Y_train))
testDataset = tf.data.Dataset.from_tensor_slices((X_testPatch, Y_test))
trainDataset = trainDataset.shuffle(buffer_size=1024).batch(64,drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)
testDataset = testDataset.batch(64,drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)

# **Compile and train model**

---



---



In [17]:
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [19]:
model.fit(trainDataset,epochs=8,validation_data=testDataset)

Epoch 1/8
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8891 - loss: 0.2964 - val_accuracy: 0.8721 - val_loss: 0.3565
Epoch 2/8
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8921 - loss: 0.2844 - val_accuracy: 0.8641 - val_loss: 0.3679
Epoch 3/8
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8964 - loss: 0.2764 - val_accuracy: 0.8704 - val_loss: 0.3482
Epoch 4/8
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.8998 - loss: 0.2601 - val_accuracy: 0.8735 - val_loss: 0.3636
Epoch 5/8
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9033 - loss: 0.2535 - val_accuracy: 0.8772 - val_loss: 0.3499
Epoch 6/8
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.9074 - loss: 0.2454 - val_accuracy: 0.8778 - val_loss: 0.3421
Epoch 7/8
[1m937/937[0m [32m━━━

<keras.src.callbacks.history.History at 0x7f2624d07d00>