Creating a CLIP-like model (Contrastive Language–Image Pretraining) with multi-head attention in TensorFlow involves building two separate encoders—one for images and one for text—and then aligning their representations using contrastive loss.

Here’s a simplified version of a CLIP-style model using:

A vision encoder (with Conv and MHA)
A text encoder (with Embedding + MHA)
Contrastive loss to bring matching image-text pairs closer in embedding space

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model

# ==== Vision Encoder ====
def build_vision_encoder(image_shape=(224, 224, 3), projection_dim=256):
    """Build the image tower: a small CNN that maps an image to one vector.

    Args:
        image_shape: Shape of a single input image (without the batch axis).
        projection_dim: Width of the output embedding.

    Returns:
        A Keras ``Model`` named ``"vision_encoder"`` whose output is the
        layer-normalised ``projection_dim``-wide embedding of each image.
    """
    image_input = layers.Input(shape=image_shape)

    # Layers are listed in the order they are applied: two conv stages,
    # spatial pooling down to a single feature vector, then the projection.
    stack = [
        layers.Conv2D(32, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Conv2D(64, 3, activation='relu'),
        layers.GlobalAveragePooling2D(),
        layers.Dense(projection_dim),
        layers.LayerNormalization(),
    ]

    features = image_input
    for layer in stack:
        features = layer(features)

    return Model(image_input, features, name="vision_encoder")

# ==== Text Encoder ====
def build_text_encoder(vocab_size=10000, max_len=40, projection_dim=256):
    """Build the text tower: embedding + one self-attention block + pooling.

    Args:
        vocab_size: Number of distinct token ids the embedding table holds.
        max_len: Fixed number of token ids per input sequence.
        projection_dim: Width of the token embeddings and of the output.

    Returns:
        A Keras ``Model`` named ``"text_encoder"`` whose output is the
        layer-normalised ``projection_dim``-wide embedding of each sequence.

    Notes:
        * ``key_dim`` is the size of each attention head, so every one of the
          4 heads works at the full ``projection_dim`` width.
        * No positional signal is added and the tokens are mean-pooled, so
          the resulting embedding does not depend on token order.
    """
    token_ids = layers.Input(shape=(max_len,))
    embedded = layers.Embedding(vocab_size, projection_dim)(token_ids)

    # Self-attention block with a residual connection followed by LayerNorm.
    self_attention = layers.MultiHeadAttention(num_heads=4, key_dim=projection_dim)
    attended = self_attention(embedded, embedded)
    residual = layers.Add()([embedded, attended])
    normalized = layers.LayerNormalization()(residual)

    # Collapse the token axis, then project into the shared embedding space.
    pooled = layers.GlobalAveragePooling1D()(normalized)
    projected = layers.Dense(projection_dim)(pooled)
    sentence_embedding = layers.LayerNormalization()(projected)

    return Model(token_ids, sentence_embedding, name="text_encoder")

# ==== CLIP Head ====
class CLIPModel(tf.keras.Model):
    """Dual-encoder model trained with a symmetric contrastive loss.

    Owns a vision encoder and a text encoder (created by
    ``build_vision_encoder`` and ``build_text_encoder``) that both emit
    ``projection_dim``-wide embeddings. ``train_step`` is overridden so that
    image ``i`` and text ``i`` of a batch are treated as the matching pair.

    Args:
        image_shape: Shape of one input image, forwarded to the vision encoder.
        vocab_size: Token vocabulary size, forwarded to the text encoder.
        max_len: Token sequence length, forwarded to the text encoder.
        projection_dim: Width of the shared embedding space.
    """

    def __init__(self, image_shape=(224, 224, 3), vocab_size=10000, max_len=40, projection_dim=256):
        super().__init__()
        self.vision_encoder = build_vision_encoder(image_shape, projection_dim)
        self.text_encoder = build_text_encoder(vocab_size, max_len, projection_dim)
        self.projection_dim = projection_dim

    def compile(self, optimizer, **kwargs):
        """Configure the model for training.

        The optimizer is forwarded to the base class instead of being assigned
        afterwards: Keras then resolves string identifiers such as ``"adam"``
        into optimizer instances and does not build a throwaway default
        optimizer first.

        Args:
            optimizer: Optimizer instance or identifier understood by Keras.
            **kwargs: Any further ``Model.compile`` options (for example
                ``run_eagerly``); they are passed through unchanged.
        """
        super().compile(optimizer=optimizer, **kwargs)

    def compute_loss(self, image_embeddings, text_embeddings):
        """Return the symmetric contrastive loss for one batch.

        NOTE(review): recent Keras versions define a ``Model.compute_loss``
        hook with a different signature; this method shadows it, so the stock
        ``test_step``/``evaluate`` path is presumably unusable - confirm
        before relying on it.

        Args:
            image_embeddings: ``(batch, dim)`` tensor from the vision encoder.
            text_embeddings: ``(batch, dim)`` tensor from the text encoder;
                row ``i`` is the positive match for image row ``i``.

        Returns:
            Scalar tensor: the mean of the image-to-text and text-to-image
            cross-entropy losses.
        """
        # Normalize embeddings so the dot product below is a cosine similarity.
        image_embeddings = tf.math.l2_normalize(image_embeddings, axis=1)
        text_embeddings = tf.math.l2_normalize(text_embeddings, axis=1)

        # logits[i][j] is the cosine similarity between image i and text j,
        # i.e. the matrix holds every pairwise similarity within the batch.
        # No temperature scaling is applied, so values stay within [-1, 1].
        logits = tf.matmul(image_embeddings, text_embeddings, transpose_b=True)

        # The matching text for image i sits on the diagonal.
        labels = tf.range(tf.shape(logits)[0])

        # Cross-entropy in both directions: rows (image -> text) and
        # columns (text -> image).
        loss_i2t = tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
        loss_t2i = tf.keras.losses.sparse_categorical_crossentropy(labels, tf.transpose(logits), from_logits=True)
        return (tf.reduce_mean(loss_i2t) + tf.reduce_mean(loss_t2i)) / 2

    def train_step(self, data):
        """Run one optimisation step on a batch of image/text pairs.

        Keras wraps the batch differently depending on how it was fed
        (e.g. ``(x,)``, ``(x, y)`` or ``(x, y, sample_weight)``), so the batch
        is normalised first instead of being unpacked as a bare 2-tuple.
        Both ``(images, texts)`` passed as a single input and
        ``x=images, y=texts`` are accepted.

        Args:
            data: The batch as delivered by Keras.

        Returns:
            Dict with the scalar training loss under the key ``"loss"``.
        """
        x, y, _ = tf.keras.utils.unpack_x_y_sample_weight(data)
        # When the pair was supplied as one input, it arrives in `x` with no `y`.
        images, texts = x if y is None else (x, y)

        with tf.GradientTape() as tape:
            image_embeddings = self.vision_encoder(images, training=True)
            text_embeddings = self.text_encoder(texts, training=True)
            loss = self.compute_loss(image_embeddings, text_embeddings)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return {"loss": loss}


In [None]:
import numpy as np

clip = CLIPModel()
clip.compile(optimizer=tf.keras.optimizers.Adam())

# Dummy data: shapes and the token-id range match the CLIPModel defaults
# (224x224x3 images, vocabulary of 10000, sequences of 40 tokens).
batch_size = 8
dummy_images = np.random.rand(batch_size, 224, 224, 3).astype("float32")
dummy_texts = np.random.randint(0, 10000, (batch_size, 40))

# Train step: feed the pair as one dataset element so that train_step receives
# exactly (images, texts). Passing the tuple as the single `x` argument of
# train_on_batch gets it wrapped by Keras and breaks a plain 2-way unpack.
single_batch = tf.data.Dataset.from_tensors((dummy_images, dummy_texts))
clip.fit(single_batch, epochs=1)

Notes
This is simplified: the actual CLIP uses Vision Transformers and BPE tokenization with larger models.
You can upgrade the vision encoder to a ViT and use Transformer blocks for text for a closer replica.
Multi-head attention allows the model to focus on different aspects of the input simultaneously.
Want to extend it with Transformer blocks, cosine similarity logits scaling, or pretrained backbones like ResNet/BERT? Let me know!