# Exercise 2: Compare CNN vs ViT on CIFAR-10 (5 marks) #

* Dataset: <b> CIFAR-10 </b>


In [1]:
# Cell 1: Import libraries and helper functions
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.datasets import cifar10
import numpy as np
import time
import matplotlib.pyplot as plt

print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.19.0


In [2]:
# Cell 2: Load and preprocess CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Normalize pixel values to [0,1]
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0

print(f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}")


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 0us/step
x_train shape: (50000, 32, 32, 3), y_train shape: (50000, 1)
x_test shape: (10000, 32, 32, 3), y_test shape: (10000, 1)


In [3]:
# Cell 3: Define CNN model
def create_cnn_model():
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=(32, 32, 3)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(10, activation='softmax')
    ])
    return model

cnn_model = create_cnn_model()
cnn_model.compile(optimizer=optimizers.Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
cnn_model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [4]:
# Cell 4: Train CNN model and measure time
start_time = time.time()
history_cnn = cnn_model.fit(x_train, y_train, epochs=5,
                            validation_data=(x_test, y_test),
                            batch_size=64,
                            verbose=2)
cnn_train_time = time.time() - start_time
print(f"CNN training time (5 epochs): {cnn_train_time:.2f} seconds")


Epoch 1/5
782/782 - 47s - 60ms/step - accuracy: 0.4960 - loss: 1.4074 - val_accuracy: 0.6150 - val_loss: 1.0800
Epoch 2/5
782/782 - 92s - 118ms/step - accuracy: 0.6536 - loss: 0.9869 - val_accuracy: 0.6692 - val_loss: 0.9478
Epoch 3/5
782/782 - 248s - 317ms/step - accuracy: 0.7155 - loss: 0.8151 - val_accuracy: 0.7047 - val_loss: 0.8442
Epoch 4/5
782/782 - 248s - 317ms/step - accuracy: 0.7584 - loss: 0.6938 - val_accuracy: 0.7292 - val_loss: 0.7927
Epoch 5/5
782/782 - 42s - 54ms/step - accuracy: 0.7902 - loss: 0.5976 - val_accuracy: 0.7386 - val_loss: 0.7791
CNN training time (5 epochs): 677.93 seconds


In [5]:
# Cell 5: Evaluate CNN on test set
cnn_test_loss, cnn_test_acc = cnn_model.evaluate(x_test, y_test, verbose=0)
print(f"CNN Test accuracy: {cnn_test_acc:.4f}")


CNN Test accuracy: 0.7386


In [6]:
# Cell 6: Define Mini Vision Transformer (ViT) components

# Patch extraction layer
class Patches(layers.Layer):
    def __init__(self, patch_size=4):
        super(Patches, self).__init__()
        self.patch_size = patch_size

    def call(self, images):
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding='VALID')
        patch_dims = patches.shape[-1]
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

# Patch Encoder layer
class PatchEncoder(layers.Layer):
    def __init__(self, num_patches, projection_dim):
        super(PatchEncoder, self).__init__()
        self.num_patches = num_patches
        self.projection = layers.Dense(projection_dim)
        self.position_embedding = layers.Embedding(input_dim=num_patches, output_dim=projection_dim)

    def call(self, patch):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

# Transformer block
def transformer_block(inputs, num_heads, projection_dim, mlp_dim, dropout_rate=0.1):
    # Layer norm 1
    x1 = layers.LayerNormalization(epsilon=1e-6)(inputs)
    # Multi-head self-attention
    attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim)(x1, x1)
    # Skip connection 1
    x2 = layers.Add()([attention_output, inputs])
    # Layer norm 2
    x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
    # MLP
    x3 = layers.Dense(mlp_dim, activation='relu')(x3)
    x3 = layers.Dense(projection_dim)(x3)
    # Skip connection 2
    return layers.Add()([x3, x2])


In [7]:
# Cell 7: Build Mini ViT model
def create_vit_classifier(
    input_shape=(32, 32, 3),
    patch_size=4,
    num_patches=64,  # (32/4)*(32/4) = 8*8=64 patches
    projection_dim=64,
    transformer_layers=4,
    num_heads=4,
    mlp_dim=128,
    num_classes=10
    ):
    inputs = layers.Input(shape=input_shape)
    # Extract patches
    patches = Patches(patch_size)(inputs)
    # Encode patches
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Transformer blocks
    x = encoded_patches
    for _ in range(transformer_layers):
        x = transformer_block(x, num_heads, projection_dim, mlp_dim)

    # Classification head
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.Flatten()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs=inputs, outputs=outputs)
    return model

vit_model = create_vit_classifier()
vit_model.compile(optimizer=optimizers.Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
vit_model.summary()





In [8]:
# Cell 8: Train ViT model and measure time
start_time = time.time()
history_vit = vit_model.fit(x_train, y_train, epochs=5,
                            validation_data=(x_test, y_test),
                            batch_size=64,
                            verbose=2)
vit_train_time = time.time() - start_time
print(f"ViT training time (5 epochs): {vit_train_time:.2f} seconds")


Epoch 1/5
782/782 - 877s - 1s/step - accuracy: 0.3497 - loss: 1.8998 - val_accuracy: 0.4883 - val_loss: 1.4151
Epoch 2/5
782/782 - 967s - 1s/step - accuracy: 0.4966 - loss: 1.3939 - val_accuracy: 0.5358 - val_loss: 1.2826
Epoch 3/5
782/782 - 656s - 839ms/step - accuracy: 0.5507 - loss: 1.2539 - val_accuracy: 0.5723 - val_loss: 1.1734
Epoch 4/5
782/782 - 205s - 262ms/step - accuracy: 0.5898 - loss: 1.1526 - val_accuracy: 0.5956 - val_loss: 1.1277
Epoch 5/5
782/782 - 861s - 1s/step - accuracy: 0.6130 - loss: 1.0878 - val_accuracy: 0.6145 - val_loss: 1.0803
ViT training time (5 epochs): 3565.93 seconds


In [9]:
# Cell 9: Evaluate ViT on test set
vit_test_loss, vit_test_acc = vit_model.evaluate(x_test, y_test, verbose=0)
print(f"ViT Test accuracy: {vit_test_acc:.4f}")


ViT Test accuracy: 0.6145


# Cell 10: Markdown Reflection and Comparison

### Model Performance Comparison

| Model | Training Time (5 epochs) | Test Accuracy |
|-------|-------------------------|---------------|
| CNN   | `{{cnn_train_time:.2f}} sec` | `{{cnn_test_acc:.4f}}` |
| ViT   | `{{vit_train_time:.2f}} sec` | `{{vit_test_acc:.4f}}` |

- **Which architecture performed better and why?**  
  The CNN typically performs better on smaller datasets like CIFAR-10 because its convolutional layers efficiently capture local spatial patterns and require fewer parameters. The ViT model, while capable of modeling long-range dependencies through self-attention, usually needs more data or pretraining for strong performance on small datasets.

- **How to improve ViT performance?**  
  To improve ViT on CIFAR-10:
  - Use stronger data augmentation (e.g., Mixup, Cutout).
  - Increase training epochs.
  - Use a hybrid CNN-ViT architecture to capture local and global features.
  - Pretrain on larger datasets and fine-tune.
  - Employ token-reduction techniques (like TokenLearner) or dynamic patching.
  - Experiment with larger projection dimensions and transformer layers if resources allow.
