<a href="https://colab.research.google.com/github/SalarShafiee/Image-classification/blob/main/ViT_implementation_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

In [2]:
if __name__ == '__main__':
    # Hyper-parameters read by the model-building code below.
    # 64 patches of 4x4 pixels with 3 channels each give 48 raw values per patch.
    config = {
        'num_heads': 12,
        'num_layers': 12,
        'hidden_dim': 48,
        'mlp_dim': 3072,
        'dropout_rate': 0.3,
        'num_patches': 64,
        'patch_size': 4,
        'num_channels': 3,
    }


In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization, MultiHeadAttention, Add, Embedding, Concatenate, Input
from tensorflow.keras.models import Model

# -------------------------------------------------------------------
# Class token
class Classtoken(Layer):
    """Trainable classification token, repeated once per batch element.

    The layer owns a single weight of shape (1, 1, D), where D is the size of
    the last input axis. The input values are not used; only the input's
    batch size and dtype determine the output, which has shape (batch, 1, D).
    """

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        # A single token vector as wide as the last axis of the input.
        token_shape = (1, 1, input_shape[-1])
        self.w = self.add_weight(
            shape=token_shape,
            initializer=tf.keras.initializers.RandomNormal(),
            trainable=True
        )

    def call(self, inputs):
        # The batch size is read at run time; the width is read from the weight.
        target_shape = [tf.shape(inputs)[0], 1, self.w.shape[-1]]
        tokens = tf.broadcast_to(self.w, target_shape)
        # Return the token in the same dtype as the incoming tensor.
        return tf.cast(tokens, dtype=inputs.dtype)

# -------------------------------------------------------------------
# MLP block (instead of creating Dense+Dropout each call)
class MLPBlock(Layer):
    """Feed-forward block: Dense(gelu) -> Dropout -> Dense -> Dropout.

    All four sub-layers are created once in the constructor and reused on
    every call.

    Args:
        cf: configuration dict; the keys 'mlp_dim', 'dropout_rate' and
            'hidden_dim' are read.
    """

    def __init__(self, cf):
        super().__init__()
        # Expansion to mlp_dim with GELU, then projection back to hidden_dim.
        self.fc1 = Dense(cf['mlp_dim'], activation="gelu")
        self.drop1 = Dropout(cf['dropout_rate'])
        self.fc2 = Dense(cf['hidden_dim'])
        # NOTE(review): this rate is fixed at 0.2 and does not follow
        # cf['dropout_rate'] like drop1 does — confirm this is intended.
        self.drop2 = Dropout(0.2)

    def call(self, x):
        # Apply the sub-layers in their fixed order.
        for stage in (self.fc1, self.drop1, self.fc2, self.drop2):
            x = stage(x)
        return x

# -------------------------------------------------------------------
# Transformer encoder block
class TransformerEncoder(Layer):
    """One encoder block: normalise -> self-attention -> add, then normalise -> MLP -> add.

    Args:
        cf: configuration dict; 'num_heads' and 'hidden_dim' are read here,
            and the dict is also handed to MLPBlock.
    """

    def __init__(self, cf):
        super().__init__()
        # Attention sub-block. key_dim is set to cf['hidden_dim'] as given.
        self.norm1 = LayerNormalization()
        self.attn = MultiHeadAttention(
            num_heads=cf['num_heads'],
            key_dim=cf['hidden_dim'],
        )
        self.add1 = Add()

        # Feed-forward sub-block.
        self.norm2 = LayerNormalization()
        self.mlp = MLPBlock(cf)
        self.add2 = Add()

    def call(self, x):
        # Self-attention over the normalised sequence, added back onto the input.
        normed = self.norm1(x)
        attended = self.attn(query=normed, value=normed)
        after_attention = self.add1([x, attended])

        # MLP over the normalised attention result, added back onto it.
        transformed = self.mlp(self.norm2(after_attention))
        return self.add2([after_attention, transformed])

# -------------------------------------------------------------------
# Vision Transformer
class _PositionEmbedding(Layer):
    """Add a trainable per-position vector to every element of a sequence.

    Expects an input whose last two axes are (positions, dim) and returns a
    tensor of the same shape.
    """

    def build(self, input_shape):
        # One vector per sequence position, as wide as the incoming embeddings.
        self.pos = self.add_weight(
            shape=(input_shape[-2], input_shape[-1]),
            initializer="random_uniform",
            trainable=True,
            name="position_embedding",
        )

    def call(self, inputs):
        # (positions, dim) broadcasts against (batch, positions, dim).
        return inputs + tf.cast(self.pos, dtype=inputs.dtype)


def ViT(cf):
    """Build a Vision Transformer classifier as a Keras functional Model.

    Args:
        cf: configuration dict. Required keys: 'num_patches', 'patch_size',
            'num_channels', 'hidden_dim', 'num_layers', plus whatever
            TransformerEncoder reads from it. Optional key: 'num_classes'
            (defaults to 10).

    Returns:
        A Model mapping flattened patches of shape
        (batch, num_patches, patch_size * patch_size * num_channels) to
        softmax class probabilities of shape (batch, num_classes).
    """
    # Input: one flattened patch per sequence position.
    patch_dim = cf['patch_size'] * cf['patch_size'] * cf['num_channels']
    inputs = Input((cf['num_patches'], patch_dim))  # e.g. (None, 64, 48) for CIFAR

    # Patch embedding
    patch_embed = Dense(cf['hidden_dim'])(inputs)

    # Position embedding. This has to be a layer that sits in the functional
    # graph: calling Embedding on a concrete tf.range tensor only yields a
    # constant, so its weights would not belong to the returned Model and
    # would never be trained.
    x = _PositionEmbedding()(patch_embed)

    # Class token, prepended along the sequence axis.
    cls_token = Classtoken()(x)
    x = Concatenate(axis=1)([cls_token, x])

    # Transformer encoders
    for _ in range(cf['num_layers']):
        x = TransformerEncoder(cf)(x)

    # Head: classify from the class-token position only.
    x = LayerNormalization()(x)
    x = x[:, 0, :]  # CLS token
    x = Dropout(0.1)(x)
    outputs = Dense(cf.get('num_classes', 10), activation="softmax")(x)

    return Model(inputs, outputs)


In [4]:
# Instantiate the network from the hyper-parameter dict.
model = ViT(cf=config)

In [5]:
# Print the layer-by-layer overview of the freshly built model.
model.summary()

In [6]:
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from sklearn.model_selection import train_test_split

# Fetch the CIFAR-10 train/test arrays.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Convert pixels to float32 and divide by 255.
x_train, x_test = (
    pixels.astype("float32") / 255.0 for pixels in (x_train, x_test)
)

# Hold out 10% of the training set for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, random_state=42, test_size=0.1
)

# Report the image and label shapes of each split.
print("Train:", *(arr.shape for arr in (x_train, y_train)))
print("Val:  ", *(arr.shape for arr in (x_val, y_val)))
print("Test: ", *(arr.shape for arr in (x_test, y_test)))


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step
Train: (45000, 32, 32, 3) (45000, 1)
Val:   (5000, 32, 32, 3) (5000, 1)
Test:  (10000, 32, 32, 3) (10000, 1)


In [7]:

from tensorflow.keras.optimizers import Adam


# Labels are passed as integer class ids, hence the sparse cross-entropy loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=Adam(learning_rate=3e-4),
)


In [8]:
import tensorflow as tf

# take CIFAR-10 images (batch, 32, 32, 3) and convert to patches
def preprocess_cifar_for_vit(images, patch_size=4):
    """Cut a batch of images into non-overlapping square patches, one flat vector per patch.

    Args:
        images: 4-D batch of images laid out as (batch, height, width, channels),
            e.g. CIFAR-10 with shape (batch, 32, 32, 3).
        patch_size: side length of each square patch in pixels; it is also
            used as the stride, so patches do not overlap.

    Returns:
        Tensor of shape (batch, num_patches, patch_size * patch_size * channels).
        For 32x32 RGB input with patch_size=4 this is (batch, 64, 48).
    """
    # Extract non-overlapping patch_size x patch_size patches.
    patches = tf.image.extract_patches(
        images=images,
        sizes=[1, patch_size, patch_size, 1],
        strides=[1, patch_size, patch_size, 1],
        rates=[1, 1, 1, 1],
        padding="VALID"
    )  # (batch, rows, cols, patch_size*patch_size*channels)

    # Take the per-patch length from the extracted tensor rather than assuming
    # 3 channels, so inputs with any channel count reshape correctly.
    patch_dim = patches.shape[-1]
    if patch_dim is None:
        # Static depth unknown; fall back to the run-time value.
        patch_dim = tf.shape(patches)[-1]

    # Flatten the grid of patches into a sequence.
    return tf.reshape(patches, [tf.shape(images)[0], -1, patch_dim])


In [9]:
# Convert every split into sequences of flattened patches.
x_train_patches, x_val_patches, x_test_patches = (
    preprocess_cifar_for_vit(split) for split in (x_train, x_val, x_test)
)

# Shapes as recorded in this notebook's output.
print(x_train_patches.shape)  # (45000, 64, 48)
print(x_val_patches.shape)  # (5000, 64, 48)
print(x_test_patches.shape)  # (10000, 64, 48)


(45000, 64, 48)
(5000, 64, 48)
(10000, 64, 48)


In [10]:
# Print the model overview again (same call as in the earlier cell), now after compile().
model.summary()

In [16]:
# Train for a single epoch and score the validation patches alongside it.
history = model.fit(
    x_train_patches,
    y_train,
    validation_data=(x_val_patches, y_val),
    epochs=1,
    batch_size=32,
    callbacks=None,
    verbose="auto",
)

[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 95ms/step - accuracy: 0.3379 - loss: 1.7215 - val_accuracy: 0.3958 - val_loss: 1.5833
