In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical


# Load CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Apply the preprocessing that the ImageNet ResNet50 weights were trained with
# (it operates on raw 0-255 pixel values). Dividing by 255 instead hands the
# pretrained filters and batch-norm statistics inputs on a scale they were
# never fitted to.
x_train = tf.keras.applications.resnet50.preprocess_input(x_train.astype('float32'))
x_test = tf.keras.applications.resnet50.preprocess_input(x_test.astype('float32'))

# Convert class vectors to binary class matrices (one-hot encoding)
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Load the ResNet50 model with ImageNet weights, excluding the top layer
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(32, 32, 3))

# Fine-tune the whole backbone: every layer stays trainable
for layer in base_model.layers:
    layer.trainable = True

# Add a small classification head on top of the backbone features
x = base_model.output
x = Flatten()(x)
x = Dense(512, activation='relu')(x)
predictions = Dense(10, activation='softmax')(x)

# Define the model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model (one-hot labels -> categorical cross-entropy)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data here, so the test
# accuracy below is not an independent estimate if it drives any model choice.
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_accuracy}")

# Predict labels for test set (from here on `predictions` holds the predicted
# class probabilities, no longer the symbolic output tensor of the model)
predictions = model.predict(x_test)
predicted_labels = tf.argmax(predictions, axis=1)
actual_labels = tf.argmax(y_test, axis=1)

# Print overall accuracy, recomputed by hand from the predicted labels
accuracy = tf.reduce_mean(tf.cast(predicted_labels == actual_labels, tf.float32))
print(f"Overall accuracy: {accuracy.numpy()}")


ImageNet is an image database organized according to the WordNet hierarchy (currently only the nouns), in which each node of the hierarchy is depicted by hundreds and thousands of images. The project has been instrumental in advancing computer vision and deep learning research. The data is available for free to researchers for non-commercial use.

In the code above, we used the ResNet50 architecture to measure a baseline test accuracy. Next, we will try to increase the model's accuracy by adding attention layers.

Here, `predictions` is the output of the model: a 2D tensor in which each row holds the predicted probabilities of every class for one example. `tf.argmax(predictions, axis=1)` returns the index of the maximum value in each row, which corresponds to the predicted class label for that example.

# ResNet50+Attention

In [None]:

from tensorflow.keras.layers import Dense, Flatten, Input, LayerNormalization, MultiHeadAttention, Add, Dropout,GlobalAveragePooling2D


# Load CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Apply the preprocessing that the ImageNet ResNet50 weights were trained with
# (it operates on raw 0-255 pixel values). Dividing by 255 instead hands the
# pretrained filters and batch-norm statistics inputs on a scale they were
# never fitted to.
x_train = tf.keras.applications.resnet50.preprocess_input(x_train.astype('float32'))
x_test = tf.keras.applications.resnet50.preprocess_input(x_test.astype('float32'))

# Convert class vectors to binary class matrices (one-hot encoding)
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Load the ResNet50 model with ImageNet weights, excluding the top layer
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(32, 32, 3))

# Collapse the spatial axes of the backbone output into one feature vector per
# image; the printed shape shows the feature width the attention block must match
x = base_model.output
x = GlobalAveragePooling2D()(x)
print(x.shape)



In [None]:
# Define the AttentionBlock
class AttentionBlock(tf.keras.layers.Layer):
    """Self-attention followed by a dense layer, each wrapped in a residual
    connection and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input. The dense layer emits
            this many features, so it must equal the input width for the
            residual additions to be shape-compatible.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).

    Raises:
        ValueError: When the layer is built on an input whose last axis is
            known and differs from ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # `key_dim` is the size of each individual head, so the embedding is
        # split across the heads instead of giving every head the full width.
        self.multi_head_attention = MultiHeadAttention(
            num_heads=num_heads, key_dim=max(1, embed_dim // num_heads)
        )
        self.norm1 = LayerNormalization(epsilon=1e-6)  # Post-attention normalization
        self.norm2 = LayerNormalization(epsilon=1e-6)  # Post-dense normalization
        self.dense = Dense(embed_dim, activation='relu')
        self.add = Add()

    def build(self, input_shape):
        # Fail early with a readable message instead of an opaque shape error
        # from the residual Add further down.
        feature_dim = input_shape[-1]
        if feature_dim is not None and feature_dim != self.embed_dim:
            raise ValueError(
                f"AttentionBlock was created with embed_dim={self.embed_dim} "
                f"but received inputs whose last axis has size {feature_dim}."
            )
        super().build(input_shape)

    def call(self, inputs):
        # Self-attention: the input serves as both query and value
        attn_output = self.multi_head_attention(inputs, inputs)
        out1 = self.norm1(self.add([inputs, attn_output]))  # Residual connection + normalization
        dense_output = self.dense(out1)
        return self.norm2(self.add([out1, dense_output]))  # Residual connection + normalization

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        # from its config when a model is saved or cloned.
        config = super().get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

# Hyper-parameters handed to AttentionBlock: embed_dim is chosen to equal the
# width of the GlobalAveragePooling2D output; num_heads is the head count.
embed_dim, num_heads = 2048, 8

In [None]:
# Define a custom layer to handle dimension expansion and squeezing
class ExpandDimsLayer(tf.keras.layers.Layer):
    """Keras-layer wrapper around ``tf.expand_dims`` that inserts a new axis at position 1."""

    def call(self, inputs):
        expanded = tf.expand_dims(inputs, 1)
        return expanded

class SqueezeDimsLayer(tf.keras.layers.Layer):
    """Keras-layer wrapper around ``tf.squeeze`` that removes the size-1 axis at position 1."""

    def call(self, inputs):
        squeezed = tf.squeeze(inputs, axis=[1])
        return squeezed

# Treat the pooled feature vector as a one-step sequence, pass it through the
# attention block, then drop the sequence axis again.
# NOTE(review): with a single step the attention has only one position to
# attend to — confirm this is the intended use of the block.
x = ExpandDimsLayer()(x)
x = AttentionBlock(embed_dim, num_heads)(x)
x = SqueezeDimsLayer()(x)

# Classification head: hidden layer, dropout, 10-way softmax
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(10, activation='softmax')(x)

# Tie backbone input and head output together
model = Model(inputs=base_model.input, outputs=predictions)

# A disabled earlier variant froze every layer except the newly added ones
# (selected by layer name); the active choice is to fine-tune the whole backbone.
for layer in base_model.layers:
    layer.trainable = True

# Compile for one-hot labels
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit, reporting metrics on the test split after every epoch
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test, y_test),
)

# Final metrics on the test split
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_accuracy}")

# Class probabilities for every test image, reduced to label ids
predictions = model.predict(x_test)
predicted_labels = tf.math.argmax(predictions, axis=1)
actual_labels = tf.math.argmax(y_test, axis=1)

# Fraction of test images whose predicted label equals the true label
matches = tf.math.equal(predicted_labels, actual_labels)
accuracy = tf.reduce_mean(tf.cast(matches, tf.float32))
print(f"Overall accuracy: {accuracy.numpy()}")

We should avoid calling `tf.expand_dims` and `tf.squeeze` directly on Keras tensors; instead, wrap them in a custom layer or use the appropriate built-in Keras layers.


Note that ResNet-50 might be too complex for a relatively small dataset like CIFAR-10, and a smaller architecture like VGG16 could be more appropriate. We will train the model from scratch, since the weights pretrained on ImageNet may not be very useful to us.

# VGG 16

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical

# Load CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Apply the preprocessing that the ImageNet VGG16 weights were trained with
# (it operates on raw 0-255 pixel values). This matters most here because the
# majority of the backbone is frozen below and cannot adapt to a different
# input scale such as pixels divided by 255.
x_train = tf.keras.applications.vgg16.preprocess_input(x_train.astype('float32'))
x_test = tf.keras.applications.vgg16.preprocess_input(x_test.astype('float32'))

# Convert class vectors to binary class matrices (one-hot encoding)
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Load the VGG16 model with pre-trained ImageNet weights, excluding the top layer
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(32, 32, 3))

# Freeze every backbone layer except the last four, which stay trainable
# and are fine-tuned together with the new head
for layer in base_model.layers[:-4]:
    layer.trainable = False

# Add custom layers on top of the base model
x = base_model.output
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)  # Add dropout for regularization
predictions = Dense(10, activation='softmax')(x)  # 10 classes for CIFAR-10

# Define the model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model (one-hot labels -> categorical cross-entropy)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; `history` keeps the per-epoch metrics returned by fit()
# NOTE(review): the test split doubles as validation data here, so the test
# accuracy below is not an independent estimate if it drives any model choice.
history = model.fit(x_train, y_train, epochs=25, batch_size=32, validation_data=(x_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_accuracy}")

# Predict labels for test set (from here on `predictions` holds the predicted
# class probabilities, no longer the symbolic output tensor of the model)
predictions = model.predict(x_test)
predicted_labels = tf.argmax(predictions, axis=1)
actual_labels = tf.argmax(y_test, axis=1)

# Print overall accuracy, recomputed by hand from the predicted labels
accuracy = tf.reduce_mean(tf.cast(predicted_labels == actual_labels, tf.float32))
print(f"Overall accuracy: {accuracy.numpy()}")


# ResNet9 Architecture

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers
import numpy as np


from tensorflow.keras import layers, Sequential

# Custom padding layer
class CustomPaddingLayer(layers.Layer):
    """Pads axes 1 and 2 of a 4-D tensor with zeros, symmetrically.

    For the channels-last image batches used in this notebook these axes are
    height and width; the batch and channel axes are left untouched.

    Args:
        padding: Number of zero rows/columns added on each side of both axes.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).
    """

    def __init__(self, padding, **kwargs):
        super().__init__(**kwargs)
        self.padding = padding

    def call(self, inputs):
        side = [self.padding, self.padding]
        return tf.pad(inputs, [[0, 0], side, side, [0, 0]], mode='CONSTANT')

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created
        # from its config when a model is saved or cloned.
        config = super().get_config()
        config.update({"padding": self.padding})
        return config

# Define the ResNet9 stages in TensorFlow

# In Keras, BatchNormalization's `momentum` is the share of the *previous*
# moving average that is kept at each update. The former value of 0.1 (the
# PyTorch-style convention) made the inference-time statistics follow little
# more than the most recent training batch; 0.9 is the equivalent Keras value.
BN_MOMENTUM = 0.9


def _conv_block(filters, pool=False):
    """Build a zero-padded 3x3 conv -> batch norm -> ReLU stage.

    Args:
        filters: Number of output channels of the convolution.
        pool: When True, append a 2x2 max pooling with stride 2.

    Returns:
        A ``Sequential`` holding the stage's layers.
    """
    stage = [
        CustomPaddingLayer(padding=1),  # Adds padding of 1 on each side
        # No `input_shape` hint is passed: the conv is not the first layer of
        # the stage and it receives the already padded tensor.
        layers.Conv2D(filters, kernel_size=(3, 3), strides=(1, 1), padding='valid'),
        layers.BatchNormalization(epsilon=1e-05, momentum=BN_MOMENTUM),
        layers.ReLU(),
    ]
    if pool:
        stage.append(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'))
    return Sequential(stage)


# Each stage below is called once on a random batch of the shape it will
# receive inside the full model; this creates its weights and lets the
# printed output shape be checked by eye.
# NOTE(review): the "res" stages are plain conv stacks — no skip connection is
# added anywhere in this cell; confirm whether residual adds were intended.

conv1 = _conv_block(64)

input_tensor = tf.random.normal([64, 32, 32, 3])  # a batch shaped like the CIFAR-10 images
output_tensor1 = conv1(input_tensor)

print("Layer after Conv1",output_tensor1.shape)


conv2 = _conv_block(128, pool=True)

input_tensor = tf.random.normal([64, 32, 32, 64])  # shape produced by conv1
output_tensor1 = conv2(input_tensor)
print("Layer after Conv2",output_tensor1.shape)


res1_block0 = _conv_block(128)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by conv2
output_tensor1 = res1_block0(input_tensor)
print("Layer after res1_block0",output_tensor1.shape)


res1_block1 = _conv_block(128)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by res1_block0
output_tensor1 = res1_block1(input_tensor)
print("Layer after res1_block1",output_tensor1.shape)


conv3 = _conv_block(256, pool=True)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by res1_block1
output_tensor1 = conv3(input_tensor)
print("Layer after Conv3",output_tensor1.shape)


conv4 = _conv_block(512, pool=True)

input_tensor = tf.random.normal([64, 8, 8, 256])  # shape produced by conv3
output_tensor1 = conv4(input_tensor)
print("Layer after Conv4",output_tensor1.shape)


res2_block0 = _conv_block(512)

input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by conv4
output_tensor1 = res2_block0(input_tensor)
print("Layer after res2_block0",output_tensor1.shape)


res2_block1 = _conv_block(512)

input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by res2_block0
output_tensor1 = res2_block1(input_tensor)
print("Layer after res2_block1",output_tensor1.shape)


classifier = Sequential([
    layers.MaxPooling2D(pool_size=(2, 2), strides=1, padding='valid'),  # Max pooling
    layers.Flatten(),  # Flatten layer
    layers.Dropout(0.2),  # Dropout with a dropout rate of 0.2
    layers.Dense(10, activation='softmax')  # Fully connected layer with softmax activation for classification
])

# Example input to test the layer
input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by res2_block1
output_tensor = classifier(input_tensor)

print("classifier_layer",output_tensor.shape)


In [None]:
from tensorflow.keras import Sequential

# Chain the ResNet9 stages, in order, into one Sequential model
model = Sequential()
for stage in (conv1, conv2, res1_block0, res1_block1,
              conv3, conv4, res2_block0, res2_block1, classifier):
    model.add(stage)

# Show the stage-by-stage summary
model.summary()


In [None]:
from tensorflow.keras.datasets import cifar10

# Fetch CIFAR-10 and divide the pixel values by 255; the labels are left as
# integer class ids, which is what the sparse loss below expects
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train, x_test = (images.astype('float32') / 255.0 for images in (x_train, x_test))


# Augmentation stage: random horizontal flips and random rotations
data_augmentation = Sequential([layers.RandomFlip("horizontal"), layers.RandomRotation(0.1)])

# Put the augmentation in front of the ResNet9 stages and compile
model = Sequential(
    [data_augmentation]
    + [conv1, conv2, res1_block0, res1_block1, conv3, conv4, res2_block0, res2_block1, classifier]
)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit, reporting metrics on the test split after every epoch
model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_data=(x_test, y_test),
)

# ResNet9+Attention

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers
from tensorflow.keras.layers import Dense, Flatten, Input, LayerNormalization, MultiHeadAttention, Add, Dropout,GlobalAveragePooling2D
import numpy as np


from tensorflow.keras import layers, Sequential

# Custom padding layer
class CustomPaddingLayer(layers.Layer):
    """Pads axes 1 and 2 of a 4-D tensor with zeros, symmetrically.

    For the channels-last image batches used in this notebook these axes are
    height and width; the batch and channel axes are left untouched.

    Args:
        padding: Number of zero rows/columns added on each side of both axes.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).
    """

    def __init__(self, padding, **kwargs):
        super().__init__(**kwargs)
        self.padding = padding

    def call(self, inputs):
        side = [self.padding, self.padding]
        return tf.pad(inputs, [[0, 0], side, side, [0, 0]], mode='CONSTANT')

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created
        # from its config when a model is saved or cloned.
        config = super().get_config()
        config.update({"padding": self.padding})
        return config

# Define the ResNet9 stages in TensorFlow

# In Keras, BatchNormalization's `momentum` is the share of the *previous*
# moving average that is kept at each update. The former value of 0.1 (the
# PyTorch-style convention) made the inference-time statistics follow little
# more than the most recent training batch; 0.9 is the equivalent Keras value.
BN_MOMENTUM = 0.9


def _conv_block(filters, pool=False):
    """Build a zero-padded 3x3 conv -> batch norm -> ReLU stage.

    Args:
        filters: Number of output channels of the convolution.
        pool: When True, append a 2x2 max pooling with stride 2.

    Returns:
        A ``Sequential`` holding the stage's layers.
    """
    stage = [
        CustomPaddingLayer(padding=1),  # Adds padding of 1 on each side
        # No `input_shape` hint is passed: the conv is not the first layer of
        # the stage and it receives the already padded tensor.
        layers.Conv2D(filters, kernel_size=(3, 3), strides=(1, 1), padding='valid'),
        layers.BatchNormalization(epsilon=1e-05, momentum=BN_MOMENTUM),
        layers.ReLU(),
    ]
    if pool:
        stage.append(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'))
    return Sequential(stage)


# Each stage below is called once on a random batch of the shape it will
# receive from the preceding stage; this creates its weights and lets the
# printed output shape be checked by eye.
# NOTE(review): the "res" stages are plain conv stacks — no skip connection is
# added anywhere in this cell; confirm whether residual adds were intended.

conv1 = _conv_block(64)

input_tensor = tf.random.normal([64, 32, 32, 3])  # a batch shaped like the CIFAR-10 images
output_tensor1 = conv1(input_tensor)

print("Layer after Conv1",output_tensor1.shape)


conv2 = _conv_block(128, pool=True)

input_tensor = tf.random.normal([64, 32, 32, 64])  # shape produced by conv1
output_tensor1 = conv2(input_tensor)
print("Layer after Conv2",output_tensor1.shape)


res1_block0 = _conv_block(128)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by conv2
output_tensor1 = res1_block0(input_tensor)
print("Layer after res1_block0",output_tensor1.shape)


res1_block1 = _conv_block(128)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by res1_block0
output_tensor1 = res1_block1(input_tensor)
print("Layer after res1_block1",output_tensor1.shape)


conv3 = _conv_block(256, pool=True)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by res1_block1
output_tensor1 = conv3(input_tensor)
print("Layer after Conv3",output_tensor1.shape)


conv4 = _conv_block(512, pool=True)

input_tensor = tf.random.normal([64, 8, 8, 256])  # shape produced by conv3
output_tensor1 = conv4(input_tensor)
print("Layer after Conv4",output_tensor1.shape)


res2_block0 = _conv_block(512)

input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by conv4
output_tensor1 = res2_block0(input_tensor)
print("Layer after res2_block0",output_tensor1.shape)


res2_block1 = _conv_block(512)

input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by res2_block0
output_tensor1 = res2_block1(input_tensor)
print("Layer after res2_block1",output_tensor1.shape)


classifier = Sequential([
    layers.MaxPooling2D(pool_size=(2, 2), strides=1, padding='valid'),  # Max pooling
    layers.Flatten(),  # Flatten layer
    layers.Dropout(0.2),  # Dropout with a dropout rate of 0.2
    layers.Dense(10, activation='softmax')  # Fully connected layer with softmax activation for classification
])

# Example input to test the layer
input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by res2_block1
output_tensor = classifier(input_tensor)

print("classifier_layer",output_tensor.shape)


In [None]:


# Define the AttentionBlock
class AttentionBlock(tf.keras.layers.Layer):
    """Self-attention followed by a dense layer, each wrapped in a residual
    connection and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input. The dense layer emits
            this many features, so it must equal the input width for the
            residual additions to be shape-compatible.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).

    Raises:
        ValueError: When the layer is built on an input whose last axis is
            known and differs from ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # `key_dim` is the size of each individual head, so the embedding is
        # split across the heads instead of giving every head the full width.
        self.multi_head_attention = MultiHeadAttention(
            num_heads=num_heads, key_dim=max(1, embed_dim // num_heads)
        )
        self.norm1 = LayerNormalization(epsilon=1e-6)  # Post-attention normalization
        self.norm2 = LayerNormalization(epsilon=1e-6)  # Post-dense normalization
        self.dense = Dense(embed_dim, activation='relu')
        self.add = Add()

    def build(self, input_shape):
        # Fail early with a readable message instead of an opaque shape error
        # from the residual Add further down.
        feature_dim = input_shape[-1]
        if feature_dim is not None and feature_dim != self.embed_dim:
            raise ValueError(
                f"AttentionBlock was created with embed_dim={self.embed_dim} "
                f"but received inputs whose last axis has size {feature_dim}."
            )
        super().build(input_shape)

    def call(self, inputs):
        # Self-attention: the input serves as both query and value
        attn_output = self.multi_head_attention(inputs, inputs)
        out1 = self.norm1(self.add([inputs, attn_output]))  # Residual connection + normalization
        dense_output = self.dense(out1)
        return self.norm2(self.add([out1, dense_output]))  # Residual connection + normalization

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        # from its config when a model is saved or cloned.
        config = super().get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

# Hyper-parameters handed to AttentionBlock.
# NOTE(review): 2048 was chosen to match a GlobalAveragePooling2D output, but
# this ResNet9 pipeline has no such layer and its stages are checked with
# 128- and 512-channel tensors — confirm the value before reusing it here.
embed_dim, num_heads = 2048, 8

In [None]:
from tensorflow.keras import Sequential

# Create a Sequential model
model = Sequential()

# AttentionBlock adds its input to a dense output of width embed_dim, so each
# block has to be sized to the channel count of the stage it follows (the
# global embed_dim of 2048 fits neither stage and fails in the residual Add).
RES1_CHANNELS = 128  # filters of res1_block0 / res1_block1
RES2_CHANNELS = 512  # filters of res2_block0 / res2_block1

# Add layers to the model sequentially
model.add(conv1)
model.add(conv2)

model.add(res1_block0)

# Attention after the first res1 stage
attention_block = AttentionBlock(RES1_CHANNELS, num_heads)
model.add(attention_block)

model.add(res1_block1)

# Attention after the second res1 stage
attention_block = AttentionBlock(RES1_CHANNELS, num_heads)
model.add(attention_block)

model.add(conv3)
model.add(conv4)

model.add(res2_block0)

# Attention after the first res2 stage
attention_block = AttentionBlock(RES2_CHANNELS, num_heads)
model.add(attention_block)

model.add(res2_block1)

# Attention after the second res2 stage
attention_block = AttentionBlock(RES2_CHANNELS, num_heads)
model.add(attention_block)

model.add(classifier)

# Print model summary
model.summary()


In [None]:
from tensorflow.keras.datasets import cifar10

# Load CIFAR-10 dataset; labels stay as integer class ids, which is what the
# sparse loss below expects
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0


# Define data augmentation
data_augmentation = Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
])

# Build the model that is actually trained. The attention blocks must be
# listed here as well: the previous version of this cell chained only the
# plain ResNet9 stages, so the "ResNet9+Attention" run contained no attention.
# Each block is sized to the channel count of the stage it follows
# (128 for the res1 stages, 512 for the res2 stages).
model = Sequential([
    data_augmentation,
    conv1, conv2,
    res1_block0, AttentionBlock(128, num_heads),
    res1_block1, AttentionBlock(128, num_heads),
    conv3, conv4,
    res2_block0, AttentionBlock(512, num_heads),
    res2_block1, AttentionBlock(512, num_heads),
    classifier,
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, batch_size=64, epochs=20, validation_data=(x_test, y_test))

# This time we will train the model without data augmentation

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers
import numpy as np


from tensorflow.keras import layers, Sequential

# Custom padding layer
class CustomPaddingLayer(layers.Layer):
    """Pads axes 1 and 2 of a 4-D tensor with zeros, symmetrically.

    For the channels-last image batches used in this notebook these axes are
    height and width; the batch and channel axes are left untouched.

    Args:
        padding: Number of zero rows/columns added on each side of both axes.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).
    """

    def __init__(self, padding, **kwargs):
        super().__init__(**kwargs)
        self.padding = padding

    def call(self, inputs):
        side = [self.padding, self.padding]
        return tf.pad(inputs, [[0, 0], side, side, [0, 0]], mode='CONSTANT')

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created
        # from its config when a model is saved or cloned.
        config = super().get_config()
        config.update({"padding": self.padding})
        return config

# Define the ResNet9 stages in TensorFlow

# In Keras, BatchNormalization's `momentum` is the share of the *previous*
# moving average that is kept at each update. The former value of 0.1 (the
# PyTorch-style convention) made the inference-time statistics follow little
# more than the most recent training batch; 0.9 is the equivalent Keras value.
BN_MOMENTUM = 0.9


def _conv_block(filters, pool=False):
    """Build a zero-padded 3x3 conv -> batch norm -> ReLU stage.

    Args:
        filters: Number of output channels of the convolution.
        pool: When True, append a 2x2 max pooling with stride 2.

    Returns:
        A ``Sequential`` holding the stage's layers.
    """
    stage = [
        CustomPaddingLayer(padding=1),  # Adds padding of 1 on each side
        # No `input_shape` hint is passed: the conv is not the first layer of
        # the stage and it receives the already padded tensor.
        layers.Conv2D(filters, kernel_size=(3, 3), strides=(1, 1), padding='valid'),
        layers.BatchNormalization(epsilon=1e-05, momentum=BN_MOMENTUM),
        layers.ReLU(),
    ]
    if pool:
        stage.append(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'))
    return Sequential(stage)


# Each stage below is called once on a random batch of the shape it will
# receive inside the full model; this creates its weights and lets the
# printed output shape be checked by eye.
# NOTE(review): the "res" stages are plain conv stacks — no skip connection is
# added anywhere in this cell; confirm whether residual adds were intended.

conv1 = _conv_block(64)

input_tensor = tf.random.normal([64, 32, 32, 3])  # a batch shaped like the CIFAR-10 images
output_tensor1 = conv1(input_tensor)

print("Layer after Conv1",output_tensor1.shape)


conv2 = _conv_block(128, pool=True)

input_tensor = tf.random.normal([64, 32, 32, 64])  # shape produced by conv1
output_tensor1 = conv2(input_tensor)
print("Layer after Conv2",output_tensor1.shape)


res1_block0 = _conv_block(128)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by conv2
output_tensor1 = res1_block0(input_tensor)
print("Layer after res1_block0",output_tensor1.shape)


res1_block1 = _conv_block(128)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by res1_block0
output_tensor1 = res1_block1(input_tensor)
print("Layer after res1_block1",output_tensor1.shape)


conv3 = _conv_block(256, pool=True)

input_tensor = tf.random.normal([64, 16, 16, 128])  # shape produced by res1_block1
output_tensor1 = conv3(input_tensor)
print("Layer after Conv3",output_tensor1.shape)


conv4 = _conv_block(512, pool=True)

input_tensor = tf.random.normal([64, 8, 8, 256])  # shape produced by conv3
output_tensor1 = conv4(input_tensor)
print("Layer after Conv4",output_tensor1.shape)


res2_block0 = _conv_block(512)

input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by conv4
output_tensor1 = res2_block0(input_tensor)
print("Layer after res2_block0",output_tensor1.shape)


res2_block1 = _conv_block(512)

input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by res2_block0
output_tensor1 = res2_block1(input_tensor)
print("Layer after res2_block1",output_tensor1.shape)


classifier = Sequential([
    layers.MaxPooling2D(pool_size=(2, 2), strides=1, padding='valid'),  # Max pooling
    layers.Flatten(),  # Flatten layer
    layers.Dropout(0.2),  # Dropout with a dropout rate of 0.2
    layers.Dense(10, activation='softmax')  # Fully connected layer with softmax activation for classification
])

# Example input to test the layer
input_tensor = tf.random.normal([64, 4, 4, 512])  # shape produced by res2_block1
output_tensor = classifier(input_tensor)

print("classifier_layer",output_tensor.shape)

from tensorflow.keras import Sequential

# Chain the ResNet9 stages, in order, into one Sequential model
model = Sequential()
for stage in (conv1, conv2, res1_block0, res1_block1,
              conv3, conv4, res2_block0, res2_block1, classifier):
    model.add(stage)

# Show the stage-by-stage summary
model.summary()

from tensorflow.keras.datasets import cifar10

# Fetch CIFAR-10 and divide the pixel values by 255; the labels are left as
# integer class ids, which is what the sparse loss below expects
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train, x_test = (images.astype('float32') / 255.0 for images in (x_train, x_test))


# Same stages as before, this time with no augmentation stage in front
model = Sequential(
    [conv1, conv2, res1_block0, res1_block1, conv3, conv4, res2_block0, res2_block1, classifier]
)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit, reporting metrics on the test split after every epoch
model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_data=(x_test, y_test),
)



In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers
from tensorflow.keras.layers import Dense, Flatten, Input, LayerNormalization, MultiHeadAttention, Add, Dropout,GlobalAveragePooling2D
import numpy as np


from tensorflow.keras import layers, Sequential

# Custom padding layer
class CustomPaddingLayer(layers.Layer):
    """Zero-pad axes 1 and 2 of a 4-D tensor by `padding` entries on each side.

    Axes 0 and 3 are left untouched. In this notebook the layer is placed in
    front of 'valid' 3x3 convolutions with `padding=1`.

    Args:
        padding: Number of zeros added before and after axes 1 and 2.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`),
            forwarded to the base class so the layer can be rebuilt from its
            config.
    """

    def __init__(self, padding, **kwargs):
        super().__init__(**kwargs)
        self.padding = padding

    def call(self, inputs):
        pad = self.padding
        # One [before, after] pair per axis; only the two middle axes grow.
        return tf.pad(inputs, [[0, 0], [pad, pad], [pad, pad], [0, 0]], mode='CONSTANT')

    def get_config(self):
        """Return the constructor arguments so the layer can be saved or cloned."""
        config = super().get_config()
        config.update({"padding": self.padding})
        return config

# Define the layers in TensorFlow

# Keras updates its running statistics as
#     moving = moving * momentum + batch * (1 - momentum)
# so `momentum` is the weight of the OLD value. The previous value of 0.1
# (with epsilon=1e-05) looks like PyTorch's BatchNorm defaults carried over
# unchanged; PyTorch weights the NEW value by its momentum, so the equivalent
# Keras setting is 0.9. With 0.1 the inference statistics track only the most
# recent batches.
BN_MOMENTUM = 0.9


def _conv_block(filters, pool=False, **conv_kwargs):
    """Build one stage: pad(1) -> 3x3 'valid' conv -> batch norm -> ReLU [-> 2x2 max-pool].

    Args:
        filters: Number of output channels of the convolution.
        pool: If True, append a 2x2 max-pool with stride 2.
        **conv_kwargs: Extra keyword arguments forwarded to `layers.Conv2D`.

    Returns:
        A `Sequential` containing the stage's layers.
    """
    stage = [
        CustomPaddingLayer(padding=1),  # Adds padding of 1 on each side
        layers.Conv2D(filters, kernel_size=(3, 3), strides=(1, 1), padding='valid', **conv_kwargs),
        layers.BatchNormalization(epsilon=1e-05, momentum=BN_MOMENTUM),
        layers.ReLU(),
    ]
    if pool:
        stage.append(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'))
    return Sequential(stage)


def _probe(label, block, input_shape):
    """Run `block` on one random batch of `input_shape` and print the output shape.

    Returns:
        The `(input, output)` tensors of the probe call.
    """
    probe_in = tf.random.normal(input_shape)
    probe_out = block(probe_in)
    print(label, probe_out.shape)
    return probe_in, probe_out


conv1 = _conv_block(64, input_shape=(32, 32, 3))
input_tensor, output_tensor1 = _probe("Layer after Conv1", conv1, [64, 32, 32, 3])

conv2 = _conv_block(128, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv2", conv2, [64, 32, 32, 64])

# NOTE(review): despite their names, the res* stages are plain convolution
# stages; nothing in this notebook adds a skip connection around them.
res1_block0 = _conv_block(128)
input_tensor, output_tensor1 = _probe("Layer after res1_block0", res1_block0, [64, 16, 16, 128])

res1_block1 = _conv_block(128)
input_tensor, output_tensor1 = _probe("Layer after res1_block1", res1_block1, [64, 16, 16, 128])

conv3 = _conv_block(256, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv3", conv3, [64, 16, 16, 128])

conv4 = _conv_block(512, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv4", conv4, [64, 8, 8, 256])

res2_block0 = _conv_block(512)
input_tensor, output_tensor1 = _probe("Layer after res2_block0", res2_block0, [64, 4, 4, 512])

res2_block1 = _conv_block(512)
input_tensor, output_tensor1 = _probe("Layer after res2_block1", res2_block1, [64, 4, 4, 512])


classifier = Sequential([
    layers.MaxPooling2D(pool_size=(2, 2), strides=1, padding='valid'),  # Overlapping pool: 4x4 -> 3x3
    layers.Flatten(),  # Flatten layer
    layers.Dropout(0.2),  # Dropout with a dropout rate of 0.2
    layers.Dense(10, activation='softmax')  # Fully connected layer with softmax activation for classification
])

# Example input to test the layer
input_tensor, output_tensor = _probe("classifier_layer", classifier, [64, 4, 4, 512])




# Define the AttentionBlock
class AttentionBlock(tf.keras.layers.Layer):
    """Self-attention followed by a dense layer, each with residual add + layer norm.

    The dense layer has `embed_dim` units, and its output is added back onto a
    tensor that has the input's channel count. When the two widths differ, a
    linear projection back to the input width is inserted so the residual add
    is well defined; when they are equal, the computation is unchanged.

    Args:
        embed_dim: Units of the dense layer; also used as `key_dim` of the
            attention layer.
        num_heads: Number of attention heads.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`).
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        super(AttentionBlock, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.norm1 = LayerNormalization(epsilon=1e-6)  # Post-attention normalization
        self.norm2 = LayerNormalization(epsilon=1e-6)  # Post-dense normalization
        self.dense = Dense(embed_dim, activation='relu')
        self.project = None  # Created in build() only if widths differ
        self.add = Add()

    def build(self, input_shape):
        channels = input_shape[-1]
        if channels is None:
            raise ValueError("AttentionBlock requires a known size for the last input axis.")
        channels = int(channels)
        if channels != self.embed_dim:
            # Map the dense output back to the input width for the residual add.
            self.project = Dense(channels)
        super(AttentionBlock, self).build(input_shape)

    def call(self, inputs):
        attn_output = self.multi_head_attention(inputs, inputs)
        out1 = self.norm1(self.add([inputs, attn_output]))  # Residual connection + normalization
        dense_output = self.dense(out1)
        if self.project is not None:
            dense_output = self.project(dense_output)
        return self.norm2(self.add([out1, dense_output]))  # Residual connection + normalization

    def get_config(self):
        """Return the constructor arguments so the layer can be saved or cloned."""
        config = super(AttentionBlock, self).get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

# Set parameters for the AttentionBlock
num_heads = 8
# Each AttentionBlock adds its Dense(embed_dim) output back onto the feature
# map it receives, so embed_dim is set to that feature map's channel count:
# 128 after the res1 stages and 512 after the res2 stages.
stage1_embed_dim = 128
stage2_embed_dim = 512


from tensorflow.keras import Sequential

# Create a Sequential model. The explicit Input spec makes Keras build the
# model as layers are added, so shape problems surface here and summary()
# can report output shapes and parameter counts.
model = Sequential()
model.add(tf.keras.Input(shape=(32, 32, 3)))

# Add layers to the model sequentially
model.add(conv1)
model.add(conv2)

model.add(res1_block0)
model.add(AttentionBlock(stage1_embed_dim, num_heads))

model.add(res1_block1)
model.add(AttentionBlock(stage1_embed_dim, num_heads))

model.add(conv3)
model.add(conv4)

model.add(res2_block0)
model.add(AttentionBlock(stage2_embed_dim, num_heads))

model.add(res2_block1)
model.add(AttentionBlock(stage2_embed_dim, num_heads))

model.add(classifier)

# Print model summary
model.summary()


from tensorflow.keras.datasets import cifar10

# Load CIFAR-10 dataset and scale pixel values to [0, 1]
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Compile and train the model assembled above. It must not be rebuilt here:
# doing so previously replaced it with an attention-free network, so the
# AttentionBlocks were never trained.
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE: the test split is also used as validation data, so the reported
# validation accuracy is not an independent held-out estimate.
model.fit(x_train, y_train, batch_size=64, epochs=20, validation_data=(x_test, y_test))



# Normal CNN without residual blocks and then using attention layers on top of it

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers
from tensorflow.keras.layers import Dense, Flatten, Input, LayerNormalization, MultiHeadAttention, Add, Dropout,GlobalAveragePooling2D
import numpy as np


from tensorflow.keras import layers, Sequential

# Custom padding layer
class CustomPaddingLayer(layers.Layer):
    """Zero-pad axes 1 and 2 of a 4-D tensor by `padding` entries on each side.

    Axes 0 and 3 are left untouched. In this notebook the layer is placed in
    front of 'valid' 3x3 convolutions with `padding=1`.

    Args:
        padding: Number of zeros added before and after axes 1 and 2.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`),
            forwarded to the base class so the layer can be rebuilt from its
            config.
    """

    def __init__(self, padding, **kwargs):
        super().__init__(**kwargs)
        self.padding = padding

    def call(self, inputs):
        pad = self.padding
        # One [before, after] pair per axis; only the two middle axes grow.
        return tf.pad(inputs, [[0, 0], [pad, pad], [pad, pad], [0, 0]], mode='CONSTANT')

    def get_config(self):
        """Return the constructor arguments so the layer can be saved or cloned."""
        config = super().get_config()
        config.update({"padding": self.padding})
        return config

# Define the layers in TensorFlow

# Keras updates its running statistics as
#     moving = moving * momentum + batch * (1 - momentum)
# so `momentum` is the weight of the OLD value. The previous value of 0.1
# (with epsilon=1e-05) looks like PyTorch's BatchNorm defaults carried over
# unchanged; PyTorch weights the NEW value by its momentum, so the equivalent
# Keras setting is 0.9. With 0.1 the inference statistics track only the most
# recent batches.
BN_MOMENTUM = 0.9


def _conv_block(filters, pool=False, **conv_kwargs):
    """Build one stage: pad(1) -> 3x3 'valid' conv -> batch norm -> ReLU [-> 2x2 max-pool].

    Args:
        filters: Number of output channels of the convolution.
        pool: If True, append a 2x2 max-pool with stride 2.
        **conv_kwargs: Extra keyword arguments forwarded to `layers.Conv2D`.

    Returns:
        A `Sequential` containing the stage's layers.
    """
    stage = [
        CustomPaddingLayer(padding=1),  # Adds padding of 1 on each side
        layers.Conv2D(filters, kernel_size=(3, 3), strides=(1, 1), padding='valid', **conv_kwargs),
        layers.BatchNormalization(epsilon=1e-05, momentum=BN_MOMENTUM),
        layers.ReLU(),
    ]
    if pool:
        stage.append(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'))
    return Sequential(stage)


def _probe(label, block, input_shape):
    """Run `block` on one random batch of `input_shape` and print the output shape.

    Returns:
        The `(input, output)` tensors of the probe call.
    """
    probe_in = tf.random.normal(input_shape)
    probe_out = block(probe_in)
    print(label, probe_out.shape)
    return probe_in, probe_out


conv1 = _conv_block(64, input_shape=(32, 32, 3))
input_tensor, output_tensor1 = _probe("Layer after Conv1", conv1, [64, 32, 32, 3])

conv2 = _conv_block(128, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv2", conv2, [64, 32, 32, 64])

conv3 = _conv_block(256, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv3", conv3, [64, 16, 16, 128])

conv4 = _conv_block(512, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv4", conv4, [64, 8, 8, 256])


classifier = Sequential([
    layers.MaxPooling2D(pool_size=(2, 2), strides=1, padding='valid'),  # Overlapping pool: 4x4 -> 3x3
    layers.Flatten(),  # Flatten layer
    layers.Dropout(0.2),  # Dropout with a dropout rate of 0.2
    layers.Dense(10, activation='softmax')  # Fully connected layer with softmax activation for classification
])

# Example input to test the layer
input_tensor, output_tensor = _probe("classifier_layer", classifier, [64, 4, 4, 512])




# # Define the AttentionBlock
# class AttentionBlock(tf.keras.layers.Layer):
#     def __init__(self, embed_dim, num_heads):
#         super(AttentionBlock, self).__init__()
#         self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
#         self.norm1 = LayerNormalization(epsilon=1e-6)  # Post-attention normalization
#         self.norm2 = LayerNormalization(epsilon=1e-6)  # Post-dense normalization
#         self.dense = Dense(embed_dim, activation='relu')
#         self.add = Add()

#     def build(self, input_shape):
#         # This method can be used to create variables used by the layer
#         super(AttentionBlock, self).build(input_shape)

#     def call(self, inputs):
#         attn_output = self.multi_head_attention(inputs, inputs)
#         out1 = self.norm1(self.add([inputs, attn_output]))  # Residual connection + normalization
#         dense_output = self.dense(out1)
#         return self.norm2(self.add([out1, dense_output]))  # Residual connection + normalization

# # Set parameters for the AttentionBlock
# embed_dim = 2048  # Match the output dimension of GlobalAveragePooling2D
# num_heads = 8


from tensorflow.keras import Sequential

# Baseline: the four convolution stages followed by the classifier, with no
# attention blocks. The model is assembled once; the explicit Input spec makes
# Keras build it immediately so summary() can report output shapes and
# parameter counts, and the summarized object is the one that gets trained.
model = Sequential([
    tf.keras.Input(shape=(32, 32, 3)),
    conv1, conv2, conv3, conv4, classifier,
])

# Print model summary
model.summary()


from tensorflow.keras.datasets import cifar10

# Load CIFAR-10 dataset and scale pixel values to [0, 1]
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Compile the model. Labels are integer class ids, hence the sparse
# categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE: the test split is also used as validation data, so the reported
# validation accuracy is not an independent held-out estimate.
model.fit(x_train, y_train, batch_size=64, epochs=20, validation_data=(x_test, y_test))

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers
from tensorflow.keras.layers import Dense, Flatten, Input, LayerNormalization, MultiHeadAttention, Add, Dropout,GlobalAveragePooling2D
import numpy as np


from tensorflow.keras import layers, Sequential

# Custom padding layer
class CustomPaddingLayer(layers.Layer):
    """Zero-pad axes 1 and 2 of a 4-D tensor by `padding` entries on each side.

    Axes 0 and 3 are left untouched. In this notebook the layer is placed in
    front of 'valid' 3x3 convolutions with `padding=1`.

    Args:
        padding: Number of zeros added before and after axes 1 and 2.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`),
            forwarded to the base class so the layer can be rebuilt from its
            config.
    """

    def __init__(self, padding, **kwargs):
        super().__init__(**kwargs)
        self.padding = padding

    def call(self, inputs):
        pad = self.padding
        # One [before, after] pair per axis; only the two middle axes grow.
        return tf.pad(inputs, [[0, 0], [pad, pad], [pad, pad], [0, 0]], mode='CONSTANT')

    def get_config(self):
        """Return the constructor arguments so the layer can be saved or cloned."""
        config = super().get_config()
        config.update({"padding": self.padding})
        return config

# Define the layers in TensorFlow

# Keras updates its running statistics as
#     moving = moving * momentum + batch * (1 - momentum)
# so `momentum` is the weight of the OLD value. The previous value of 0.1
# (with epsilon=1e-05) looks like PyTorch's BatchNorm defaults carried over
# unchanged; PyTorch weights the NEW value by its momentum, so the equivalent
# Keras setting is 0.9. With 0.1 the inference statistics track only the most
# recent batches.
BN_MOMENTUM = 0.9


def _conv_block(filters, pool=False, **conv_kwargs):
    """Build one stage: pad(1) -> 3x3 'valid' conv -> batch norm -> ReLU [-> 2x2 max-pool].

    Args:
        filters: Number of output channels of the convolution.
        pool: If True, append a 2x2 max-pool with stride 2.
        **conv_kwargs: Extra keyword arguments forwarded to `layers.Conv2D`.

    Returns:
        A `Sequential` containing the stage's layers.
    """
    stage = [
        CustomPaddingLayer(padding=1),  # Adds padding of 1 on each side
        layers.Conv2D(filters, kernel_size=(3, 3), strides=(1, 1), padding='valid', **conv_kwargs),
        layers.BatchNormalization(epsilon=1e-05, momentum=BN_MOMENTUM),
        layers.ReLU(),
    ]
    if pool:
        stage.append(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'))
    return Sequential(stage)


def _probe(label, block, input_shape):
    """Run `block` on one random batch of `input_shape` and print the output shape.

    Returns:
        The `(input, output)` tensors of the probe call.
    """
    probe_in = tf.random.normal(input_shape)
    probe_out = block(probe_in)
    print(label, probe_out.shape)
    return probe_in, probe_out


conv1 = _conv_block(64, input_shape=(32, 32, 3))
input_tensor, output_tensor1 = _probe("Layer after Conv1", conv1, [64, 32, 32, 3])

conv2 = _conv_block(128, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv2", conv2, [64, 32, 32, 64])

conv3 = _conv_block(256, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv3", conv3, [64, 16, 16, 128])

conv4 = _conv_block(512, pool=True)
input_tensor, output_tensor1 = _probe("Layer after Conv4", conv4, [64, 8, 8, 256])


classifier = Sequential([
    layers.MaxPooling2D(pool_size=(2, 2), strides=1, padding='valid'),  # Overlapping pool: 4x4 -> 3x3
    layers.Flatten(),  # Flatten layer
    layers.Dropout(0.2),  # Dropout with a dropout rate of 0.2
    layers.Dense(10, activation='softmax')  # Fully connected layer with softmax activation for classification
])

# Example input to test the layer
input_tensor, output_tensor = _probe("classifier_layer", classifier, [64, 4, 4, 512])




# Define the AttentionBlock
class AttentionBlock(tf.keras.layers.Layer):
    """Self-attention followed by a dense layer, each with residual add + layer norm.

    The dense layer has `embed_dim` units, and its output is added back onto a
    tensor that has the input's channel count. When the two widths differ, a
    linear projection back to the input width is inserted so the residual add
    is well defined; when they are equal, the computation is unchanged.

    Args:
        embed_dim: Units of the dense layer; also used as `key_dim` of the
            attention layer.
        num_heads: Number of attention heads.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`).
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        super(AttentionBlock, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.norm1 = LayerNormalization(epsilon=1e-6)  # Post-attention normalization
        self.norm2 = LayerNormalization(epsilon=1e-6)  # Post-dense normalization
        self.dense = Dense(embed_dim, activation='relu')
        self.project = None  # Created in build() only if widths differ
        self.add = Add()

    def build(self, input_shape):
        channels = input_shape[-1]
        if channels is None:
            raise ValueError("AttentionBlock requires a known size for the last input axis.")
        channels = int(channels)
        if channels != self.embed_dim:
            # Map the dense output back to the input width for the residual add.
            self.project = Dense(channels)
        super(AttentionBlock, self).build(input_shape)

    def call(self, inputs):
        attn_output = self.multi_head_attention(inputs, inputs)
        out1 = self.norm1(self.add([inputs, attn_output]))  # Residual connection + normalization
        dense_output = self.dense(out1)
        if self.project is not None:
            dense_output = self.project(dense_output)
        return self.norm2(self.add([out1, dense_output]))  # Residual connection + normalization

    def get_config(self):
        """Return the constructor arguments so the layer can be saved or cloned."""
        config = super(AttentionBlock, self).get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

# Set parameters for the AttentionBlock
num_heads = 8
# Each AttentionBlock adds its Dense(embed_dim) output back onto the feature
# map it receives, so embed_dim is set to that feature map's channel count:
# 128 after conv2 and 512 after conv4.
stage1_embed_dim = 128
stage2_embed_dim = 512


from tensorflow.keras import Sequential

# Create a Sequential model. The explicit Input spec makes Keras build the
# model as layers are added, so shape problems surface here and summary()
# can report output shapes and parameter counts.
model = Sequential()
model.add(tf.keras.Input(shape=(32, 32, 3)))

# Add layers to the model sequentially
model.add(conv1)
model.add(conv2)

# Two attention blocks on the conv2 feature map
model.add(AttentionBlock(stage1_embed_dim, num_heads))
model.add(AttentionBlock(stage1_embed_dim, num_heads))

model.add(conv3)
model.add(conv4)

# Two attention blocks on the conv4 feature map
model.add(AttentionBlock(stage2_embed_dim, num_heads))
model.add(AttentionBlock(stage2_embed_dim, num_heads))

model.add(classifier)

# Print model summary
model.summary()


from tensorflow.keras.datasets import cifar10

# Load CIFAR-10 dataset and scale pixel values to [0, 1]
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Compile and train the model assembled above. It must not be rebuilt here:
# doing so previously replaced it with an attention-free network, so the
# AttentionBlocks were never trained.
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE: the test split is also used as validation data, so the reported
# validation accuracy is not an independent held-out estimate.
model.fit(x_train, y_train, batch_size=64, epochs=20, validation_data=(x_test, y_test))

# Understanding Why ViT Trains Badly on Small Datasets: An Intuitive Perspective
Vision transformer (ViT) is an attention neural network architecture that is shown to be effective for computer vision tasks. However, compared to ResNet-18 with a similar number of parameters, ViT has a significantly lower evaluation accuracy when trained on small datasets. To facilitate studies in related fields, we provide a visual intuition to help understand why it is the case. We first compare the performance of the two models and confirm that ViT has less accuracy than ResNet-18 when trained on small datasets. We then interpret the results by showing attention map visualization for ViT and feature map visualization for ResNet-18. The difference is further analyzed through a representation similarity perspective. We conclude that the representation of ViT trained on small datasets is hugely different from ViT trained on large datasets, which may be the reason why the performance drops a lot on small datasets.


https://arxiv.org/abs/2302.03751

https://franky07724-57962.medium.com/once-upon-a-time-in-cifar-10-c26bb056b4ce#:~:text=The%20error%20rate%20of%20a,as%20a%20super%2Dhuman%20performance.