In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# --- Patch Embedding Layer ---
class PatchEmbedding(tf.keras.layers.Layer):
    def __init__(self, patch_size, projection_dim):
        super().__init__()
        self.patch_size = patch_size
        self.projection_dim = projection_dim

    def build(self, input_shape):
        self.num_patches = (input_shape[1] // self.patch_size) * (input_shape[2] // self.patch_size)
        self.projection = layers.Dense(self.projection_dim)

    def call(self, images):
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding='VALID'
        )
        patches = tf.reshape(patches, [batch_size, -1, patches.shape[-1]])
        return self.projection(patches)



In [3]:
# --- Transformer Encoder Block ---
def transformer_encoder(inputs, num_heads, projection_dim, transformer_units, dropout_rate):
    x = layers.LayerNormalization(epsilon=1e-6)(inputs)
    x = layers.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim)(x, x)
    x = layers.Dropout(dropout_rate)(x)
    x = x + inputs

    x_res = layers.LayerNormalization(epsilon=1e-6)(x)
    x_res = layers.Dense(transformer_units, activation="relu")(x_res)
    x_res = layers.Dense(projection_dim)(x_res)
    x_res = layers.Dropout(dropout_rate)(x_res)

    return x + x_res

In [4]:
# --- Build Vision Transformer Model ---
def create_vit_model(
    image_size=(146, 81, 3),
    patch_size=13,
    num_layers=5,
    num_heads=6,
    projection_dim=32,
    transformer_units=192,
    dropout=0.15000000000000002
):
    inputs = tf.keras.Input(shape=image_size)
    x = PatchEmbedding(patch_size, projection_dim)(inputs)

    # Dynamic positional encoding based on patch sequence length
    sequence_length = tf.shape(x)[1]
    positions = tf.range(start=0, limit=sequence_length, delta=1)
    position_embedding = layers.Embedding(input_dim=1000, output_dim=projection_dim)(positions)
    x = x + position_embedding

    for _ in range(num_layers):
        x = transformer_encoder(x, num_heads, projection_dim, transformer_units, dropout)

    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer='adam',
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model


In [5]:
model = create_vit_model()




ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```


In [6]:
keras.utils.plot_model(model, show_shapes=True, to_file="vit_model_architecture.png")


NameError: name 'model' is not defined